In [1]:
import pandas as pd
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec 
from transformers import T5Tokenizer, TFAutoModelForSequenceClassification

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
data = pd.read_csv('/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv', encoding='latin1')
data.columns=['sentiment','id','datetime','flag','user','text']
data.head()

Unnamed: 0,sentiment,id,datetime,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [3]:
data.tail()

Unnamed: 0,sentiment,id,datetime,flag,user,text
1599994,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599995,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599998,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [6]:
text = data["text"]  
sentiment = data["sentiment"]  
stop_words = stopwords.words("english")
def preprocess_text(text):
  text = text.lower()
  tokens = word_tokenize(text)
  filtered_tokens = [token for token in tokens if token not in stop_words]
  return " ".join(filtered_tokens)
text = text.apply(preprocess_text)
def create_tfidf_features(text):
  vectorizer = TfidfVectorizer()
  features = vectorizer.fit_transform(text)
  return features
def create_cbow_features(text, window_size=2, vector_dim=100):
  model = Word2Vec(sentences=[text.split() for text in text], window=window_size, min_count=1, vector_size=vector_dim)
  features = []
  for doc in text.split():
    word_vectors = []
    for word in doc.split():
      if word in model.wv:
        word_vectors.append(model.wv[word])
    if word_vectors:
      features.append(sum(word_vectors) / len(word_vectors))
    else:
      features.append(np.zeros(vector_dim))  
  return np.array(features)
tfidf_features = create_tfidf_features(text)
cbow_features = create_cbow_features(text)
word_embeddings = gensim.downloader.load("glove-twitter-100")  
def get_word_embeddings(text, word_embeddings, embedding_dim):
  embeddings = np.zeros((len(text.split()), embedding_dim))
  for i, word in enumerate(text.split()):
    if word in word_embeddings.vocab:
      embeddings[i] = word_embeddings[word]
    else:
      embeddings[i] = np.zeros(embedding_dim)  
  return embeddings
def concatenate_features(text, word_embeddings, tfidf_features, cbow_features):
  embedding_dim = word_embeddings.vector_dim
  word_embeddings_list = [get_word_embeddings(doc, word_embeddings, embedding_dim) for doc in text]
  all_features = np.concatenate((np.array(word_embeddings_list), tfidf_features, cbow_features), axis=1)
  return all_features

all_features_train = concatenate_features(text.tolist(), word_embeddings, tfidf_features, cbow_features)
X_train, X_test, y_train, y_test = train_test_split(all_features_train, sentiment, test_size=0.2)
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(set(sentiment)))
def preprocess_text_for_t5(text):
  text = text.lower()
  tokens = word_tokenize(text)
  lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
  text = " ".join(lemmatized_tokens)
  encoded_text = tokenizer(text, padding="max_length", truncation=True, return_tensors="tf")
  return encoded_text["input_ids"], encoded_text["attention_mask"]
text_train_t5, attention_masks_train_t5 = text.apply(lambda t: preprocess_text_for_t5(t)).tolist()
text_test_t5, attention_masks_test_t5 = text.apply(lambda t: preprocess_text_for_t5(t)).tolist()
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.fit([text_train_t5, attention_masks_train_t5], y_train, epochs=3, batch_size=16)  
predictions = model.predict([X_test, attention_masks_test_t5]).argmax(axis=1)
accuracy = accuracy_score(y_test, predictions)
print(f"T5 Model Accuracy: {accuracy:.4f}")

KeyboardInterrupt: 