In [59]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

In [60]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [88]:
#Load Dataset
data = pd.read_csv("sentiment_analysis.csv")
data.head()

Unnamed: 0,Year,Month,Day,Time of Tweet,text,sentiment,Platform
0,2018,8,18,morning,What a great day!!! Looks like dream.,positive,Twitter
1,2018,8,18,noon,"I feel sorry, I miss you here in the sea beach",positive,Facebook
2,2017,8,18,night,Don't angry me,negative,Facebook
3,2022,6,8,morning,We attend in the class just for listening teac...,negative,Facebook
4,2022,6,8,noon,"Those who want to go, let them go",negative,Instagram


In [89]:
#Keep only required columns
data = data[['text','sentiment']]
data.head()

Unnamed: 0,text,sentiment
0,What a great day!!! Looks like dream.,positive
1,"I feel sorry, I miss you here in the sea beach",positive
2,Don't angry me,negative
3,We attend in the class just for listening teac...,negative
4,"Those who want to go, let them go",negative


In [90]:
#Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [91]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

data['clean_text'] = data['text'].apply(clean_text)

In [92]:
#Convert Text to (TF-IDF)
vectorizer = TfidfVectorizer(ngram_range=(1,2),max_features=5000)
x = vectorizer.fit_transform(data['clean_text'])
y = data['sentiment']

In [93]:
#Train-Test split
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=42, stratify=y)


In [94]:
#Train the Model
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(class_weight='balanced',max_iter=1000)
model.fit(x_train,y_train)

In [95]:
#Evaluate the Model
y_pred = model.predict(x_test)
accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n    negative       0.85      0.41      0.55        27\n     neutral       0.63      0.90      0.74        40\n    positive       0.80      0.73      0.76        33\n\n    accuracy                           0.71       100\n   macro avg       0.76      0.68      0.68       100\nweighted avg       0.75      0.71      0.70       100\n'

In [97]:
#Test on new text
def predict_sentiment(text):
    text = clean_text(text)
    vector = vectorizer.transform([text])
    return model.predict(vector)[0]
predict_sentiment("I love this product")
predict_sentiment("This is very bad")
predict_sentiment("It is okay")


'neutral'