In [29]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Download the necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load the dataset
df = pd.read_csv('sentiment-emotion-labelled_Dell_tweets.csv')

# Text preprocessing
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

def preprocess_text(text):
    if not isinstance(text, str):
        return ''
    # Remove special characters, preserving alphanumeric and space characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize and stem
    words = word_tokenize(text)
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

df['Text'] = df['Text'].apply(preprocess_text)

# Preprocess the data (convert text to numbers)
vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1, 2)) # Use 1-gram + 2-gram
X = vectorizer.fit_transform(df['Text'])
y = df['sentiment']

# Split the dataset into a training set and a test set (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.025, random_state=40)

# Define the model pipeline and the parameters grid for the grid search
# We are also testing different models
pipelines = [
    ('lsvc', Pipeline([('clf', LinearSVC(max_iter=5000))]), {'clf__C': (0.1, 1, 10)}) # increased iterations and added more C values
]

for model_name, pipeline, parameters in pipelines:
    # Grid search
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=3) 

    # Train the model
    grid_search.fit(X_train, y_train)

    # Test the model
    y_pred = grid_search.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{model_name} accuracy is: {accuracy*100:.2f}%')




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ashrafkhalil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ashrafkhalil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ashrafkhalil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Fitting 3 folds for each of 3 candidates, totalling 9 fits




lsvc accuracy is: 80.00%
