PREPROCESSING


In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt_tab')

## Cleaning and Tokenization
def process_text(text):
    """Clean a raw sentence and return it as a space-joined token string.

    Every character that is not an ASCII letter or whitespace is replaced
    by a space, the result is lower-cased and stripped, then split with
    NLTK's ``word_tokenize`` and re-joined with single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', text)
    normalized = letters_only.lower().strip()
    return ' '.join(word_tokenize(normalized))

# Convert text to numerical TF-IDF features
def vector_text(train_text, test_text):
    """Turn train and test texts into TF-IDF feature matrices.

    The vectorizer is fitted on the training texts only; the test texts
    are projected into that same feature space.

    Args:
        train_text: iterable of training documents (strings).
        test_text: iterable of test documents (strings).

    Returns:
        A tuple ``(X_train, X_test, vectorizer)`` where both matrices share
        the vocabulary learned from ``train_text`` and ``vectorizer`` is the
        fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_text)
    # Only transform here: re-fitting on the test set would rebuild the
    # vocabulary/IDF weights from test data, leaking it and producing
    # columns that no longer line up with X_train.
    X_test = vectorizer.transform(test_text)
    return X_train, X_test, vectorizer

TRAIN


In [None]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from preprocess import process_text
from sklearn.feature_extraction.text import TfidfVectorizer

# Load Dataset
file = 'C:/Users/kelly/PycharmProjects/PythonProject1/data/Sentences_AllAgree.txt'
sentences = []
labels = []

# Presumably each line has the form "<sentence>@<label>" -- confirm against
# the data file. A separate handle name keeps `file` bound to the path.
with open(file, 'r', encoding='ISO-8859-1') as dataset:
    for line in dataset:
        # Split once, from the right, on '@' so that any '@' inside the
        # sentence itself stays part of the sentence.
        parts = line.rsplit('@', 1)
        if len(parts) == 2:
            sentences.append(parts[0].strip())
            labels.append(parts[1].strip())


# Map the textual sentiment labels onto integer classes
label_mapping = {
    'positive': 1,
    'negative': -1,
    'neutral': 0,
}
y = [label_mapping[name] for name in labels]

# Clean and tokenize every sentence
sentences = list(map(process_text, sentences))

# Split the data into train (80%) and test (20%) with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.2,
    random_state=42,
)

from collections import Counter
print('Label Information: ', Counter(y_train))

# Convert text to numerical features
# NOTE(review): this TF-IDF vectorizer is created but never fitted or used in
# the code below; the features actually come from the sentence embeddings.
vectorizer = TfidfVectorizer(max_features=10000)

# Sentence embeddings: encode the train and test sentences with the
# pretrained 'all-MiniLM-L6-v2' SentenceTransformer model
from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer('all-MiniLM-L6-v2')
x_train_vec = embed_model.encode(x_train)
x_test_vec = embed_model.encode(x_test)

# Oversample the training embeddings with SMOTE; the test embeddings are
# left untouched and used as-is for evaluation
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
x_train_imb, y_train_imb = smote.fit_resample(x_train_vec, y_train)

# Print the class counts after resampling
from collections import Counter
print('Label Information: ', Counter(y_train_imb))

# Train a linear-kernel SVC on the resampled training data
model_svc = SVC(kernel='linear')
model_svc.fit(x_train_imb,y_train_imb)

# Evaluation: accuracy on the held-out test embeddings
y_pred_svc = model_svc.predict(x_test_vec)
accuracy_svc = accuracy_score(y_test, y_pred_svc)
print(f'SVC Model Accuracy: {accuracy_svc}')

# Persist the trained classifier
with open('C:/Users/kelly/PycharmProjects/PythonProject1/sentiment.pkl', 'wb') as f:
    pickle.dump(model_svc, f)

# Persist the embedding model so prediction can encode text the same way
with open('C:/Users/kelly/PycharmProjects/PythonProject1/transformer.pkl', 'wb') as f:
    pickle.dump(embed_model, f)

PREDICT

In [None]:
import pickle
from preprocess import process_text

# Load the Models
# NOTE(review): these paths differ from the directory the training script
# writes sentiment.pkl / transformer.pkl to -- confirm the files are copied here.
# SECURITY: pickle.load executes arbitrary code from the file; only load
# pickles that were produced by a trusted training run.

# Trained classifier
with open('C:/Users/Admin/Desktop/Financial Analysis/sentiment.pkl', 'rb') as f:
    model_svc = pickle.load(f)

# Embedding model used to encode text before classification
with open('C:/Users/Admin/Desktop/Financial Analysis/transformer.pkl', 'rb') as f:
     embed_model = pickle.load(f)


# Predict the sentiment of a single input text

def predict(text):
    """Return the sentiment label predicted for ``text``.

    The text is cleaned with ``process_text``, encoded with the loaded
    embedding model and classified with the loaded SVC.

    Args:
        text: the raw sentence to classify.

    Returns:
        One of ``'positive'``, ``'negative'`` or ``'neutral'``.

    Raises:
        KeyError: if the classifier returns a class outside 1, -1 and 0.
    """
    processed_text = process_text(text)
    vectorized_text = embed_model.encode([processed_text])
    prediction = model_svc.predict(vectorized_text)[0]
    # The classifier is trained on integer classes (positive=1, negative=-1,
    # neutral=0), so decode the predicted integer back to its label name.
    sentiment_map = {
        1: 'positive',
        -1: 'negative',
        0: 'neutral',
    }
    # int() normalizes the NumPy integer returned by predict() to a plain key.
    return sentiment_map[int(prediction)]


# Test the function: read one sentence from stdin and print its sentiment
if __name__ == '__main__':
    user_text = input('Enter the sentence: ')
    # Pass the text that was just read to the classifier.
    print('Sentiments:', predict(user_text))

