# **Pre-processing**

In [2]:
# Import cleaned csv files
import pandas as pd

training = pd.read_csv('../data/training_data_cleaned.csv')
validation = pd.read_csv('../data/validation_data_cleaned.csv')

In [3]:
training.head()

Unnamed: 0,Label,Content
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
validation.head()

Unnamed: 0,Label,Content
0,Irrelevant,I mentioned on Facebook that I was struggling ...
1,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,Negative,@Microsoft Why do I pay for WORD when it funct...
3,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,Neutral,Now the President is slapping Americans in the...


In [5]:
# Remove html
from bs4 import BeautifulSoup

def remove_html(html_content):
    try:
        # Init BeautifulSoup object
        soup = BeautifulSoup(html_content, "html.parser")
        
        # Get text
        text = soup.get_text(separator=' ', strip=True)
        
        return text
    except Exception as e:
        print(f"Error removing html: {e}")
        return html_content

In [6]:
# Warning is okay for now
training.Content = training.Content.apply(remove_html)
print(training.Content.head(2))

  soup = BeautifulSoup(html_content, "html.parser")


0    im getting on borderlands and i will murder yo...
1    I am coming to the borders and I will kill you...
Name: Content, dtype: object


In [7]:
validation.Content = validation.Content.apply(remove_html)
display(validation.Content.head(2))

  soup = BeautifulSoup(html_content, "html.parser")


0    I mentioned on Facebook that I was struggling ...
1    BBC News - Amazon boss Jeff Bezos rejects clai...
Name: Content, dtype: object

In [8]:
# Clean text using various regular expressions
import re

def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#','', text)  # Remove mentions and hashtags
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

In [9]:
training.Content = training.Content.apply(clean_text)

In [10]:
validation.Content = validation.Content.apply(clean_text)

# **Modeling**

In [12]:
# Import relevant libraries
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [13]:
# Vectorization
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(training.Content)
y = training.Label  # Labels (positive, negative, neutral)

In [14]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [15]:
# Model Training
model = MultinomialNB()
model.fit(X_train, y_train)

In [16]:
# Prediction
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7229974881384315
              precision    recall  f1-score   support

  Irrelevant       0.93      0.43      0.59      2477
    Negative       0.66      0.90      0.76      4316
     Neutral       0.82      0.63      0.71      3559
    Positive       0.70      0.80      0.75      3980

    accuracy                           0.72     14332
   macro avg       0.78      0.69      0.70     14332
weighted avg       0.76      0.72      0.71     14332



In [17]:
# logistic Regression model
model = LogisticRegressionCV(max_iter=1000)
model.fit(X_train, y_train)

In [18]:
# Prediction
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8531956461066146
              precision    recall  f1-score   support

  Irrelevant       0.87      0.81      0.84      2477
    Negative       0.83      0.90      0.87      4316
     Neutral       0.87      0.84      0.85      3559
    Positive       0.85      0.85      0.85      3980

    accuracy                           0.85     14332
   macro avg       0.86      0.85      0.85     14332
weighted avg       0.85      0.85      0.85     14332



In [19]:
# SVC works better with tfidf vectors
from sklearn.svm import LinearSVC

# Train SVM model
svm_model = LinearSVC(random_state=0, tol=1e-5, dual=True)  # LinearSVM for text classification
svm_model.fit(X_train, y_train.ravel())

# Predictions
y_pred_svm = svm_model.predict(X_test)

# Evaluate model
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

Accuracy: 0.8547306726207089
              precision    recall  f1-score   support

  Irrelevant       0.88      0.80      0.84      2477
    Negative       0.83      0.91      0.87      4316
     Neutral       0.88      0.82      0.85      3559
    Positive       0.85      0.86      0.85      3980

    accuracy                           0.85     14332
   macro avg       0.86      0.85      0.85     14332
weighted avg       0.86      0.85      0.85     14332

