In [43]:
import pandas as pd
#used to load and work on the dataset
from sklearn.model_selection import train_test_split
#used to split the data into training and testing sets
from sklearn.feature_extraction.text import TfidfVectorizer
#used to convert text data into numerical features
from sklearn.linear_model import LogisticRegression
#used to create a logistic regression model used for sentiment classification
from sklearn.metrics import accuracy_score
#used to calculate the accuracy of the model(evaluate the performance)
import numpy as np
#used to perform mathematical operations

# Load the sentiment dataset. latin-1 can decode any byte sequence, so the read
# does not fail on bytes that are not valid UTF-8.
# NOTE(review): presumably chosen because the file is not UTF-8 - confirm.
df = pd.read_csv("train1.csv", encoding='latin-1')

# Treat cells that are empty or contain only whitespace as missing values.
df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

# Drop every row that is missing its sentiment label or its text.
df.dropna(subset=["sentiment", "text"], inplace=True)

# Normalize the sentiment labels to integer class ids:
#   negative -> 0, positive -> 1, neutral -> 2
# Matching ignores surrounding whitespace and letter case. The whole column is
# converted in a single pass (string -> digit string -> int), so it never holds
# a mixture of strings and integers part-way through the conversion.
label_ids = {"negative": "0", "positive": "1", "neutral": "2"}
normalized = df["sentiment"].astype(str).str.strip().str.lower()

# Values that are neither one of the three labels nor an integer literal are
# left unchanged by replace(), so astype(int) raises ValueError for them.
df["sentiment"] = normalized.replace(label_ids).astype(int)

# Define features and labels: the post text is the model input and the
# integer-encoded sentiment is the target.
x = df["text"]
y = df["sentiment"]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=3
)

# Feature extraction: convert each text into a TF-IDF (Term Frequency-Inverse
# Document Frequency) vector.
#   min_df=1             -> keep every term that occurs in at least one document
#   stop_words="english" -> ignore common English stop words ("the", "is", ...)
#   lowercase=True       -> convert all text to lowercase before tokenizing
feature_extraction = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)

# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts into the same feature space.
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# y_train / y_test need no extra cast: they are slices of df["sentiment"],
# which has already been converted to an integer column above.

# Train the model: fit a logistic-regression classifier with default settings
# on the TF-IDF training matrix. fit() returns the fitted estimator itself.
model = LogisticRegression().fit(x_train_features, y_train)

# Predict the labels of both the training and the test split ...
prediction_on_training_data = model.predict(x_train_features)
prediction_on_test_data = model.predict(x_test_features)

# ... and compare each set of predictions with the actual labels. The two
# accuracies are only stored in variables here; nothing is printed.
accuracy_on_training_data = accuracy_score(
    y_true=y_train, y_pred=prediction_on_training_data
)
accuracy_on_test_data = accuracy_score(
    y_true=y_test, y_pred=prediction_on_test_data
)


# Input for sentiment analysis. input() already returns a str, so only the
# leading/trailing whitespace has to be stripped.
post = input("Enter your post: ").strip()

# Prepare the input for prediction: wrap the post in a one-element list and
# transform it with the same vectorizer that was fitted on the training data.
input_post = [post]
input_data_features = feature_extraction.transform(input_post)
prediction = model.predict(input_data_features)

# predict() returns one label per input document. Extract the single label as
# a plain int so the comparisons below operate on a scalar instead of relying
# on the truth value of a one-element array.
predicted_label = int(prediction[0])

# Output the result (1 = positive, 0 = negative, anything else = neutral).
if predicted_label == 1:
    print("Positive post")
elif predicted_label == 0:
    print("Negative post")
else:
    print("Neutral post")


Neutral post
