In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import numpy as np

In [2]:
df = pd.read_csv('cleaned_tweets.csv')

In [3]:
# Create train/test split
train_df, test_df = train_test_split(df, test_size=0.1, shuffle=True)

In [4]:
# Distribution of classes in the train set
print("Train set class distribution:")
print(train_df['airline_sentiment'].value_counts())

# Distribution of classes in the test set
print("\nTest set class distribution:")
print(test_df['airline_sentiment'].value_counts())

Train set class distribution:
airline_sentiment
negative    8157
neutral     2583
positive    1955
Name: count, dtype: int64

Test set class distribution:
airline_sentiment
negative    892
neutral     311
positive    208
Name: count, dtype: int64


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the train and test data
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['text'])

In [6]:
from sklearn.linear_model import SGDClassifier

# Initialize the SGDClassifier
sgd_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-4, max_iter=100, tol=None, shuffle=True)

# Train the classifier
sgd_clf.fit(X_train_tfidf, train_df['airline_sentiment'])

# Predict on the test set
y_pred = sgd_clf.predict(X_test_tfidf)

In [7]:
from sklearn.model_selection import cross_val_score

# Perform 10-fold cross-validation
cv_scores = cross_val_score(sgd_clf, X_train_tfidf, train_df['airline_sentiment'], cv=10, scoring='accuracy')

# Calculate the mean accuracy across the 10 folds
mean_cv_accuracy = np.mean(cv_scores)

print(f"Mean cross-validation accuracy: {mean_cv_accuracy:.4f}")

Mean cross-validation accuracy: 0.7821


In [8]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import nltk

# Download stopwords and wordnet
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Define preprocessing functions
def lowercase(text):
    return text.lower()

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join([word for word in text.split() if word not in stop_words])

def lemmatize(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

# Define a function to apply preprocessing actions
def preprocess_text(text, actions):
    for action in actions:
        text = action(text)
    return text

# Define different combinations of preprocessing actions
combinations = [
    [],
    [lowercase],
    [lowercase, remove_punctuation],
    [lowercase, remove_punctuation, remove_stopwords],
    [lowercase, remove_punctuation, remove_stopwords, lemmatize],
    [remove_punctuation],
    [remove_punctuation, remove_stopwords],
    [remove_punctuation, remove_stopwords, lemmatize],
    [remove_stopwords],
    [remove_stopwords, lemmatize],
    [lemmatize]
]

# Evaluate the model's performance for each combination
results = []
for actions in combinations:
    # Preprocess the text data
    train_df['processed_text'] = train_df['text'].apply(lambda x: preprocess_text(x, actions))
    test_df['processed_text'] = test_df['text'].apply(lambda x: preprocess_text(x, actions))
    
    # Initialize the TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    
    # Fit and transform the train and test data
    X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['processed_text'])
    X_test_tfidf = tfidf_vectorizer.transform(test_df['processed_text'])
    
    # Initialize the SGDClassifier
    sgd_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-4, max_iter=100, tol=None, shuffle=True)
    
    # Perform 10-fold cross-validation
    cv_scores = cross_val_score(sgd_clf, X_train_tfidf, train_df['airline_sentiment'], cv=10, scoring='accuracy')
    
    # Calculate the mean accuracy across the 10 folds
    mean_cv_accuracy = np.mean(cv_scores)
    
    # Store the results
    results.append((actions, mean_cv_accuracy))

# Print the results
for actions, accuracy in results:
    action_names = [action.__name__ for action in actions]
    print(f"Actions: {action_names}, Mean CV Accuracy: {accuracy:.4f}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Stell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Stell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Actions: [], Mean CV Accuracy: 0.7814
Actions: ['lowercase'], Mean CV Accuracy: 0.7818
Actions: ['lowercase', 'remove_punctuation'], Mean CV Accuracy: 0.7817
Actions: ['lowercase', 'remove_punctuation', 'remove_stopwords'], Mean CV Accuracy: 0.7817
Actions: ['lowercase', 'remove_punctuation', 'remove_stopwords', 'lemmatize'], Mean CV Accuracy: 0.7820
Actions: ['remove_punctuation'], Mean CV Accuracy: 0.7816
Actions: ['remove_punctuation', 'remove_stopwords'], Mean CV Accuracy: 0.7817
Actions: ['remove_punctuation', 'remove_stopwords', 'lemmatize'], Mean CV Accuracy: 0.7822
Actions: ['remove_stopwords'], Mean CV Accuracy: 0.7814
Actions: ['remove_stopwords', 'lemmatize'], Mean CV Accuracy: 0.7826
Actions: ['lemmatize'], Mean CV Accuracy: 0.7824


In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Calculate classification accuracy
classification_accuracy = accuracy_score(test_df['airline_sentiment'], y_pred)
print(f"Classification Accuracy: {classification_accuracy:.4f}")

# Print the classification report
print("Classification Report:")
print(classification_report(test_df['airline_sentiment'], y_pred))

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(test_df['airline_sentiment'], y_pred))

Classification Accuracy: 0.7796
Classification Report:
              precision    recall  f1-score   support

    negative       0.79      0.94      0.86       892
     neutral       0.69      0.40      0.51       311
    positive       0.78      0.66      0.72       208

    accuracy                           0.78      1411
   macro avg       0.75      0.67      0.69      1411
weighted avg       0.77      0.78      0.76      1411

Confusion Matrix:
[[837  35  20]
 [166 125  20]
 [ 50  20 138]]
