In [37]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from nltk.sentiment import SentimentIntensityAnalyzer

In [38]:
# Read the spreadsheet into a DataFrame; later cells read its 'Text' column.
data = pd.read_excel(io='/content/chatgpt1-2.xlsx')


In [39]:
# Discard every row whose 'Text' cell is missing so that the text-processing
# cells below only ever see rows with content.
data = data.dropna(axis='index', subset=['Text'])

In [40]:
# Preprocessing: remove English stopwords from every document.
nltk.download('stopwords')

# Keep the stopwords in a set: the filter below performs one membership test
# per word per row, and a set answers each in O(1) instead of scanning the
# whole list. `in` / `not in` checks against `stopwords` behave the same.
stopwords = set(nltk.corpus.stopwords.words('english'))

# Split on whitespace, drop words whose lowercase form is a stopword, and
# re-join with single spaces. The surviving words keep their original casing.
# str(x) guards against non-string cells in 'Text'.
data['clean_text'] = data['Text'].apply(
    lambda x: ' '.join(word for word in str(x).split() if word.lower() not in stopwords)
)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
# Bag-of-words count matrix fitted on the whole cleaned corpus.
# NOTE(review): no later cell visible here reads `X`, and `vectorizer` is
# rebound to a fresh CountVectorizer in the feature-extraction cell; confirm
# whether this cell is still needed.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['clean_text'])

In [42]:
# Download the VADER lexicon ahead of creating the VADER analyzer in the next
# cell. The call's return value is this cell's displayed output.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [43]:
# Initialize the VADER sentiment analyzer once; `sid` is reused for every row
# when the sentiment labels are generated.
sid = SentimentIntensityAnalyzer()

In [44]:
# Add sentiment labels using VADER sentiment analysis.
def _vader_label(text):
    """Map a text to 'positive', 'negative' or 'neutral'.

    The label is taken from the sign of VADER's compound score:
    greater than 0 is 'positive', less than 0 is 'negative', and exactly 0
    is 'neutral'.
    """
    # Score the text once and reuse the value; the previous nested conditional
    # ran the analyzer a second time for every row that was not positive.
    compound = sid.polarity_scores(text)['compound']
    if compound > 0:
        return 'positive'
    if compound < 0:
        return 'negative'
    return 'neutral'


# NOTE(review): labels are computed from 'clean_text', i.e. after stopword
# removal. If the stopword list contains negation words, the scores may differ
# from scoring the raw 'Text' column. Confirm this is intended.
data['sentiment'] = data['clean_text'].apply(_vader_label)

In [45]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['clean_text'],
    data['sentiment'],
    test_size=0.2,
    random_state=42,
)


In [46]:
# Feature extraction
# Fit the vocabulary on the training split only, then reuse that fitted
# vectorizer for the test split so both matrices share the same columns.
# This rebinds `vectorizer`, replacing the instance fitted on the full corpus
# in an earlier cell.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [49]:
# Fit a 60-tree random forest on the training count matrix. The seed is fixed
# so repeated runs build the same forest.
random_forest = RandomForestClassifier(
    n_estimators=60,
    random_state=42,
)
random_forest.fit(X_train_vectorized, y_train)


In [50]:
# Predict sentiment labels for the test set, using the test matrix produced by
# the vectorizer that was fitted on the training split.
y_pred = random_forest.predict(X_test_vectorized)

In [51]:
# Accuracy is the fraction of held-out rows whose predicted label equals the
# VADER-derived label (element-wise comparison, then mean of the booleans).
accuracy = (y_test == y_pred).mean()
print(f"Accuracy: {accuracy}")

Accuracy: 0.8418


In [53]:
# Classify a single unseen tweet with the trained pipeline.
new_tweet = "This is a great product! #happy"

# Apply the same stopword filter that was used on the training text.
clean_new_tweet = ' '.join(
    word
    for word in new_tweet.split()
    if word.lower() not in stopwords
)

# The vectorizer expects an iterable of documents, hence the one-element list.
new_tweet_vector = vectorizer.transform([clean_new_tweet])
predicted_sentiment = random_forest.predict(new_tweet_vector)
print(f"Predicted sentiment: {predicted_sentiment}")

Predicted sentiment: ['neutral']
