In [None]:
# SVM
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

# Download the NLTK resources needed for stop-word removal and tokenization.
import nltk
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the Punkt tokenizer model from 'punkt_tab'
# instead of the pickled 'punkt' package; without it nltk.word_tokenize()
# raises LookupError. 'punkt' is still fetched for older installations.
nltk.download('punkt_tab')

# Load the tweet dataset from CSV. The path is kept in one constant so it can
# be changed when running outside the environment this notebook was written in.
DATASET_PATH = '/content/sample_data/dataset_tweet_sentiment.csv'
data = pd.read_csv(DATASET_PATH)

# Shared preprocessing state read by preprocess_text().
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    """Normalise one document into a space-joined string of stemmed tokens.

    Steps, in order:
      1. Tokenize with ``nltk.word_tokenize``.
      2. Drop tokens that are not purely alphanumeric (punctuation, symbols).
      3. Lowercase and drop tokens found in the module-level ``stop_words`` set.
      4. Stem the surviving tokens with the module-level ``stemmer``.

    Stop words are removed *before* stemming. Filtering afterwards misses every
    stop word whose stem differs from its surface form (e.g. "this" -> "thi",
    "was" -> "wa", "very" -> "veri"), which would leak them into the features.

    Args:
        text: Raw document text. Non-string values (e.g. NaN produced by empty
            CSV cells) are treated as an empty document instead of crashing
            the tokenizer.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing remains.
    """
    if not isinstance(text, str):
        return ''

    cleaned_tokens = []
    for token in nltk.word_tokenize(text):
        if not token.isalnum():
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        cleaned_tokens.append(stemmer.stem(lowered))
    return ' '.join(cleaned_tokens)

# Clean every tweet with preprocess_text; this overwrites the 'text' column.
data['text'] = data['text'].apply(preprocess_text)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Learn the label -> integer mapping from the training labels, then encode both splits.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# TF-IDF features feeding a linear-kernel SVM. Bundling both steps in a Pipeline
# means the vectorizer vocabulary is fitted on the training text only.
svm_model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC(kernel='linear')),
]).fit(X_train, y_train_encoded)

# Score the held-out tweets.
predictions = svm_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test_encoded, y_pred=predictions)
classification_rep = classification_report(y_true=y_test_encoded, y_pred=predictions)

# Report overall accuracy followed by the per-class breakdown.
print(f'Accuracy: {accuracy * 100:.2f}%')
print('\nClassification Report:', classification_rep, sep='\n')

In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import nltk

# Make sure the NLTK stop-word corpus is present.
nltk.download('stopwords')

# 80/20 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['sentiment'], test_size=0.2, random_state=42
)

# Fit the label encoder on the training labels and encode both splits with it.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded, y_test_encoded = [
    label_encoder.transform(split_labels) for split_labels in (y_train, y_test)
]

# TF-IDF features: vocabulary and IDF weights come from the training text only;
# the test text is projected onto that fitted vocabulary.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Logistic regression with scikit-learn's default settings, trained in one step.
logistic_regression_model = LogisticRegression().fit(X_train_tfidf, y_train_encoded)

# Predict on the held-out features and score against the encoded test labels.
predictions = logistic_regression_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test_encoded, y_pred=predictions)
classification_rep = classification_report(y_true=y_test_encoded, y_pred=predictions)

# Report overall accuracy followed by the per-class breakdown.
print(f'Accuracy: {accuracy * 100:.2f}%')
print('\nClassification Report:', classification_rep, sep='\n')

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 80/20 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['sentiment'], test_size=0.2, random_state=42
)

# TF-IDF features capped at 5000 terms (tune max_features to the vocabulary size).
# The vectorizer is fitted on the training text and reused unchanged for the test text.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Random forest of 100 trees, seeded for repeatable results. It is trained on
# the raw sentiment labels, so no label encoding happens in this cell.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_tfidf, y_train
)

# Predict on the held-out features and compare with the true labels.
predictions = rf_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
classification_rep = classification_report(y_true=y_test, y_pred=predictions)

# Report overall accuracy followed by the per-class breakdown.
print(f'Accuracy: {accuracy * 100:.2f}%')
print('\nClassification Report:', classification_rep, sep='\n')
