##### Installing packages

In [5]:
!pip install transformers torch evaluate tqdm dataset

Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dataset
  Downloading dataset-1.6.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting sqlalchemy<2.0.0,>=1.3.2 (from dataset)
  Downloading SQLAlchemy-1.4.54-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting alembic>=0.6.2 (from dataset)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting banal>=1.0.1 (from dataset)
  Downloading banal-1.0.6-py2.py3-none-an

#### Loading packages

In [6]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import re
import pandas as pd
import string
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import torch, codecs, random
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from evaluate import load as load_metric
from google.colab import output
import matplotlib.pyplot as plt
from typing import List, Dict, Any
from transformers import PreTrainedTokenizer
from torch.optim import Optimizer

# Turn on Colab's custom widget manager (support for third-party notebook widgets).
output.enable_custom_widget_manager()

# Mount Google Drive at /content/drive; the dataset CSV is read from there later.
# force_remount=True re-mounts even if a mount already exists from an earlier run.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


## Data Preprocessing

In [8]:
# Read the tweets CSV from the mounted Drive folder, then preview the first rows.
df = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/Tweets_5K.csv")
df.head(n=5)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative


In [9]:
# Class balance check: number of tweets per sentiment label, most frequent first.
df.loc[:, 'sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
neutral,2023
positive,1592
negative,1385


In [11]:
# Swap each sentiment class name for its integer code
# (negative -> -1, neutral -> 0, positive -> 1), overwriting the column.
df['sentiment'] = df['sentiment'].replace(
    to_replace={'negative': -1, 'neutral': 0, 'positive': 1}
)

# Parallel Python lists: the tweet texts and their integer labels, row-aligned.
raw_tweets = df['text'].tolist()
labels = df['sentiment'].tolist()


# X = dataset['text'].to_list()
# y = dataset['sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2}).to_list()

In [17]:
# Featurize - bag of words
# `basic_preproc_tweets` comes from a preprocessing cell not shown here; each
# entry is iterated as a sequence of tokens and reduced to {token: count}.
tweet_dicts = [dict(token_counts) for token_counts in map(Counter, basic_preproc_tweets)]

# Dense document-term matrix: one row per tweet, one column per distinct token.
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform(tweet_dicts)

# Column labels of X, in column order.
vocabulary = vectorizer.get_feature_names_out()

print("Vocabulary:\n", vocabulary)

Vocabulary:
 ['!' '!!' '!!!' ... 'ï¿½4.80' 'ï¿½anisalovesu' 'ï¿½you']


In [18]:
# Wrap the bag-of-words matrix in a DataFrame labelled by token for inspection.
df_bow = pd.DataFrame(data=X, columns=vocabulary)
df_bow.head(n=5)

Unnamed: 0,!,!!,!!!,!!!!,!!!!!,!!!!!!!!,!!my,!+,!1!!,#,...,zombies?',{this,{{{HUGS}}},~,ï¿½,ï¿½1.50,ï¿½2,ï¿½4.80,ï¿½anisalovesu,ï¿½you
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Train Test Split

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

# Sanity check: (number of training rows, vocabulary size).
X_train.shape

(4000, 15121)