In [None]:
%%capture
!pip install openai
!pip install scikit-learn
!pip install matplotlib
!pip install transformers

import json
import pandas as pd
import openai
import numpy as np
from tqdm import tqdm
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import transformers
import ast
import pprint
import polars as pl
import gdown

In [None]:
import re
from spacy.lang.en import English
import string
import xgboost
import warnings
# Suppress only specific warning types.
# NOTE(review): these filters are process-wide, so every later cell also loses
# its DeprecationWarning/UserWarning messages — confirm that is intended.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
# Added: per-class precision/recall/F1 report used in the Evaluation section
from sklearn.metrics import classification_report

## Text Cleaning

In [None]:
def load_stopwords(filename):
    """Read a stopword list (one word per line) into a set.

    Parameters
    ----------
    filename : str or path-like
        Path to a plain-text file containing one stopword per line.

    Returns
    -------
    set of str
        The stopwords with surrounding whitespace removed. Blank lines are
        skipped so the empty string never ends up in the set.
    """
    # Explicit encoding keeps the result independent of the platform default.
    with open(filename, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return {word for word in stripped_lines if word}

In [None]:
# Module-level resources read by spacy_preprocessing: the stopword set and the
# spaCy English pipeline used for tokenisation.
# NOTE(review): the path presumably points at a mounted Google Drive shared
# drive — confirm it is mounted before running this cell.
stopwords_file = "/content/drive/Shareddrives/MSML641 Project/msml_641_project_scripts/mallet_en_stoplist.txt"
stopwords= load_stopwords(stopwords_file)
# NOTE(review): parser=False is presumably meant to skip dependency parsing —
# confirm the installed spaCy version still honours this argument.
nlp = English(parser=False)
def spacy_preprocessing(text, remove_punctuations='no', remove_stopwords='no',
                        remove_nonalpha='yes', exclude_list=None):
    '''
    Normalise a piece of text and return it as a lower-cased string of
    space-joined spaCy tokens.

    Processing order:
      1. http:// and https:// URLs are replaced with a space.
      2. If remove_nonalpha == 'yes', runs of punctuation/symbols that sit
         between two words (and do not start with an apostrophe) are replaced
         with a single space; contractions such as "didn't" are left intact.
      3. Any character repeated four or more times in a row is collapsed to a
         single occurrence.
      4. Runs of whitespace are collapsed to a single space.
      5. The text is tokenised with the module-level `nlp` pipeline, tokens
         are lower-cased and filtered according to the flags below.

    Parameters
    text: string to clean
    remove_punctuations: 'yes' drops tokens made up only of punctuation
    remove_stopwords: 'yes' drops tokens found (case-insensitively) in the
        module-level `stopwords` set or in exclude_list
    remove_nonalpha: 'yes' enables step 2 above
    exclude_list: custom lower-case words to drop together with the
        stopwords, ex: ['mr','managers']; ignored unless
        remove_stopwords == 'yes'

    The defaults reproduce the values that used to be hard-coded in the
    function body, so spacy_preprocessing(text) behaves as before.

    Returns the cleaned string with leading whitespace stripped.

    Example: text = "I am soooooo excited Mr. , to learn nlp. s123 2003 you're doing      great. He will be awesome!!   managers for life"
    '''

    excluded = set(exclude_list) if exclude_list else set()
    drop_punctuation = remove_punctuations == 'yes'
    drop_stopwords = remove_stopwords == 'yes'

    # Remove web addresses; the "s" is optional so plain http:// links are
    # stripped as well as https:// ones.
    text = re.sub(r"https?://[a-zA-Z.\/0-9?=]*\b", " ", text)

    # Replace special characters between words with a space. Whitespace-only
    # matches are returned unchanged so step 4 can normalise them.
    if remove_nonalpha == 'yes':
        text = re.sub(r'\b(?![\w\']+)\s*\W+\s*\b', lambda match: ' ' if match.group().strip() else match.group(), text)

    # Collapse a character repeated 4+ times into one, e.g. pooooost -> post.
    text = re.sub(r"(.)\1{3,}", r"\1", text)

    # Collapse multiple whitespace characters into a single space.
    text = re.sub(r"\s{2,}", r" ", text)

    clean_text = []
    for token in nlp(text):
        raw = token.orth_
        lowered = raw.lower()

        # Check every character so multi-character tokens such as "..." or
        # "!!" also count as punctuation.
        if drop_punctuation and raw and all(ch in string.punctuation for ch in raw):
            continue

        # Stopwords and excluded words are matched on the lower-cased token.
        if drop_stopwords and (lowered in stopwords or lowered in excluded):
            continue

        clean_text.append(lowered)

    return " ".join(clean_text).lstrip()

524it [00:00, 145262.08it/s]


## Original Data

In [None]:
%%time
# Lazily scan the raw tweets CSV, keep only the tweet text and the sentiment
# column (exposed as "label"), then materialise the result.
original_data = (
    pl.scan_csv("/content/drive/MyDrive/data/Tweets.csv")
    .select(
        [
            pl.col("text"),
            pl.col("airline_sentiment").alias("label"),
        ]
    )
    .collect()
)

CPU times: user 15.2 ms, sys: 9.68 ms, total: 24.8 ms
Wall time: 49.8 ms


## Cleaned Data

In [None]:
%%time
# Integer encoding of the sentiment classes used as model targets.
mapping = {"positive": 1, "neutral": 2, "negative": 0}

# Build the cleaned dataset: strip @mentions, encode the labels, then run the
# spaCy-based text normalisation on every tweet.
data = (
    pl.scan_csv("/content/drive/MyDrive/data/Tweets.csv")
    .select(
        # str.replace rewrites only the first match, which left every
        # @mention after the first one in the text; replace_all removes all.
        pl.col('text').str.replace_all(r'@\w+\b', ''),
        pl.col('airline_sentiment').alias("label"),
    )
    .with_columns(pl.col("label").map_dict(mapping))
    .with_columns(pl.col("text").apply(spacy_preprocessing))
    .collect()
)

CPU times: user 3.54 s, sys: 32.8 ms, total: 3.57 s
Wall time: 3.67 s


In [None]:
# Spot check: tweet at row 2 as loaded from the CSV, before any cleaning.
original_data['text'][2]

"@VirginAmerica I didn't today... Must mean I need to take another trip!"

In [None]:
# Spot check: the same row after @mention removal and spacy_preprocessing.
data['text'][2]

"i did n't today must mean i need to take another trip !"

## Enter OpenAI Key

In [None]:
# Read the API key from the OPENAI_API_KEY environment variable so it is never
# stored in the notebook; when the variable is unset, fall back to an
# interactive prompt that does not echo the key.
import os
from getpass import getpass

openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

## Extract Embeddings

In [None]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` produced by an OpenAI embedding model.

    Newlines in ``text`` are replaced with spaces before the request is sent.
    One request is made per call, and the embedding of the first (only)
    returned item is handed back.

    Parameters
    ----------
    text : str
        Text to embed.
    model : str
        Name of the embedding model passed to the API.
    """
    single_line = " ".join(text.split("\n"))
    api_result = openai.Embedding.create(input=[single_line], model=model)
    first_item = api_result['data'][0]
    return first_item['embedding']


In [None]:
# Embed every cleaned tweet and spread each embedding across separate columns
# (one column per embedding dimension).
# NOTE(review): this issues one get_embedding call per row and assumes every
# returned embedding has the same length — confirm before re-running at scale.
data_emb =(
            data
            .select(pl.col('text'))
            # Replace each tweet's text with the value returned by get_embedding.
            .with_columns(pl.col("text").apply(get_embedding))
            # Reshape to one row per tweet, turn each row into a struct, then
            # unnest the struct so every field becomes its own column.
            .select(pl.col("text").reshape((data.shape[0], -1))
                    .arr.to_struct(n_field_strategy="max_width")
                    ).unnest("text")
          )

In [None]:
# Sanity check: (number of tweets, embedding width).
data_emb.shape

(14640, 1536)

In [None]:
# Persist the embedding columns as a zstd-compressed Parquet file.
data_emb.write_parquet("/content/drive/MyDrive/data/Tweets_openai_embeddings.parquet", compression="zstd")

## Modeling

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_emb,
    data['label'],
    test_size=0.3,
    random_state=42,
)


In [None]:

# Wrap the polars splits (converted to Arrow) in XGBoost DMatrix containers.
# Only the training matrix carries labels; `pred` holds test features only.
d_train = xgboost.DMatrix(
    data=X_train.to_arrow(),
    label=y_train.to_arrow(),
)
pred = xgboost.DMatrix(data=X_test.to_arrow())

# Booster configuration for the 3-class sentiment model.
# 'max_depth' used to be listed twice (same value); the duplicate is removed
# and the "random seed" comment now sits on the 'seed' entry it describes.
params = {
    'objective': 'multi:softmax',   # Multi-class classification task (softmax output)
    'num_class': 3,                 # Number of classes (labels)
    'eval_metric': 'mlogloss',      # Multi-class logarithmic loss metric
    'eta': 0.1,                     # Learning rate (Typical values: 0.01 - 0.3)
    'max_depth': 6,                 # Maximum depth of a tree (Typical values: 3 - 10)
    'min_child_weight': 1,          # Minimum sum of instance weight needed in a child (Typical values: 1 - 10)
    'subsample': 0.8,               # Subsample ratio of the training instances (Typical values: 0.5 - 1.0)
    'colsample_bytree': 0.8,        # Subsample ratio of columns when constructing each tree (Typical values: 0.5 - 1.0)
    'gamma': 0.70,                  # Minimum loss reduction required to make a further partition on a leaf node
    'alpha': 2e-05,                 # L1 regularization term on weights
    'seed': 42,                     # Random seed for reproducibility
}

In [None]:
# Early stopping needs data the booster is not fitted on: when only the
# training set is monitored its loss keeps falling, so the stop condition can
# never trigger. Carve a validation fold out of the training split (the test
# split stays untouched for the final evaluation).
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)
d_fit = xgboost.DMatrix(X_fit.to_arrow(), y_fit.to_arrow())
d_val = xgboost.DMatrix(X_val.to_arrow(), y_val.to_arrow())

# Watchlist to monitor the training and validation performance.
# XGBoost uses the LAST entry of `evals` for early stopping, so the
# validation fold must come last.
watchlist = [(d_fit, 'train'), (d_val, 'valid')]

# Train the XGBoost model
num_rounds = 80  # Maximum number of boosting rounds
bst = xgboost.train(
    params,
    d_fit,
    num_rounds,
    evals=watchlist,
    early_stopping_rounds=10,   # stop when 'valid' mlogloss has not improved for 10 rounds
    verbose_eval=10,
)

[0]	train-mlogloss:1.01618
[10]	train-mlogloss:0.56195
[20]	train-mlogloss:0.37269
[30]	train-mlogloss:0.26961
[40]	train-mlogloss:0.20441
[50]	train-mlogloss:0.16080
[60]	train-mlogloss:0.12928
[70]	train-mlogloss:0.10629
[79]	train-mlogloss:0.09052


## Evaluation

In [None]:
# Quick look at the raw predictions for the test features; with the
# 'multi:softmax' objective these are class ids, not probabilities.
bst.predict(pred)

array([1., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [None]:
# Hard class labels: with the 'multi:softmax' objective predict() returns the
# winning class id for each row.
y_pred = bst.predict(pred)

# 'multi:softmax' never returns probabilities, so a second plain predict()
# call only duplicated the labels. Recover per-class probabilities by applying
# a softmax to the raw (untransformed) margins instead.
raw_margins = np.asarray(bst.predict(pred, output_margin=True)).reshape(len(y_pred), -1)
shifted = raw_margins - raw_margins.max(axis=1, keepdims=True)  # subtract row max for numerical stability
exp_margins = np.exp(shifted)
y_pred_prob = exp_margins / exp_margins.sum(axis=1, keepdims=True)

preds = pl.Series("preds", y_pred)
# Array with one row per test example and one column per class.
pred_probs = y_pred_prob

In [None]:

# Per-class precision/recall/F1 on the held-out split.
# Class ids follow `mapping`: 0 = negative, 1 = positive, 2 = neutral.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.95      0.92      2814
           1       0.86      0.75      0.80       694
           2       0.75      0.64      0.69       884

    accuracy                           0.86      4392
   macro avg       0.83      0.78      0.80      4392
weighted avg       0.85      0.86      0.85      4392

