In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from gensim.models import Word2Vec

In [None]:
import pandas as pd

# Parquet shards of the SST-2 dataset on the Hugging Face Hub, keyed by split name.
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'validation': 'data/validation-00000-of-00001.parquet',
    'test': 'data/test-00000-of-00001.parquet',
}

# Only the 'train' shard is read here; the evaluation set is carved out of it below.
df = pd.read_parquet("hf://datasets/stanfordnlp/sst2/" + splits["train"])

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle (and therefore every metric computed later) repeatable.
train_dataset, test_dataset = train_test_split(df, test_size=0.2, random_state=42)

print("Train dataset:")

# Last expression of the cell, so the DataFrame is shown as the cell output.
train_dataset


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Train dataset:


Unnamed: 0,idx,sentence,label
65374,65374,lacks the inspiration of the original and has ...,0
14599,14599,"is also a film of freshness , imagination and ...",1
46387,46387,far more alienating than involving,0
27872,27872,the creative animation work may not look as fu...,0
40668,40668,"make for a winning , heartwarming yarn .",1
...,...,...,...
37194,37194,its provocative conclusion,1
6265,6265,an action film disguised as a war tribute is d...,0
54886,54886,goes to absurd lengths to duck the very issues...,0
860,860,a perfect performance,1


In [None]:
# Preview the held-out split produced by train_test_split; as the cell's last
# expression, the DataFrame is shown as the cell output.
print("Test dataset:")

test_dataset

Test dataset:


Unnamed: 0,idx,sentence,label
66730,66730,with outtakes in which most of the characters ...,0
29890,29890,enigma is well-made,1
45801,45801,is ) so stoked to make an important film about...,0
29352,29352,the closest thing to the experience of space t...,1
19858,19858,lose their luster,0
...,...,...,...
34702,34702,makes a nice album,1
23754,23754,sent back,0
58408,58408,do n't think so,0
5637,5637,works beautifully as a movie without sacrifici...,1


In [None]:
# Bag-of-Words representation
#
# CountVectorizer builds a vocabulary from the text it is fitted on and encodes
# every sentence as a sparse vector of per-word occurrence counts.
vectorizer_bow = CountVectorizer()

print("Bag of Words representation:")

# Fit the vocabulary on the training sentences only; the test sentences are
# encoded with that same vocabulary at the end of the cell.
X_train_bow = vectorizer_bow.fit_transform(train_dataset['sentence'])
feature_names = vectorizer_bow.get_feature_names_out()

# A 20-word slice of the vocabulary (positions 6000-6019), not its first 20 entries.
print(feature_names[6000:6020])

# Sparse row holding the counts for the first training review.
first_review_bow = X_train_bow[0]

print("First review of a film in the train dataset:")
# Each printed entry reads "(row, column)  count": the row is 0 because this is a
# one-row slice, and the column is the word's position in the vocabulary.
print(first_review_bow)

print("\n BOW vector for the first review:")
print(first_review_bow.toarray())

# Translate the non-zero column indices of the first review back into words.
words_in_first_review = [feature_names[column] for column in first_review_bow.indices]
frequencies_in_first_review = first_review_bow.data

print("\n Words and their frequencies in the first review:")
for word, freq in zip(words_in_first_review, frequencies_in_first_review):
    print(f"{word}: {freq}")

# transform (not fit_transform): reuse the vocabulary learned from the training split.
X_test_bow = vectorizer_bow.transform(test_dataset['sentence'])

Bag of Words representation:
['illumination' 'illusion' 'illustrated' 'illustrates' 'illustrating'
 'image' 'imagery' 'images' 'imaginable' 'imaginary' 'imagination'
 'imaginative' 'imaginatively' 'imagine' 'imagined' 'imagines' 'imagining'
 'imamura' 'imax' 'imbecilic']
First review of a film in the train dataset:
  (0, 6836)	1
  (0, 12119)	3
  (0, 6301)	1
  (0, 8300)	1
  (0, 8413)	1
  (0, 566)	2
  (0, 5565)	1
  (0, 1322)	1
  (0, 8996)	1
  (0, 12118)	1
  (0, 11580)	1
  (0, 10263)	1
  (0, 12251)	1
  (0, 145)	1
  (0, 1)	1
  (0, 7703)	1
  (0, 8671)	1
  (0, 2037)	1
  (0, 6360)	1
  (0, 546)	1
  (0, 312)	1
  (0, 8690)	1

 BOW vector for the first review:
[[0 1 0 ... 0 0 0]]

 Words and their frequencies in the first review:
lacks: 1
the: 3
inspiration: 1
of: 1
original: 1
and: 2
has: 1
bloated: 1
plot: 1
that: 1
stretches: 1
running: 1
time: 1
about: 1
10: 1
minutes: 1
past: 1
child: 1
interest: 1
an: 1
adult: 1
patience: 1


In [None]:
# TF-IDF representation
#
# TfidfVectorizer is fitted on the training sentences and then reused, unchanged,
# to encode the test sentences.

vectorizer_tfidf = TfidfVectorizer()

print("TF-IDF representation:")

X_train_tfidf = vectorizer_tfidf.fit_transform(train_dataset['sentence'])

# Show the first training review followed by its sparse TF-IDF row.
# .iloc[0] selects by position: the split shuffled the index labels, so a
# label-based lookup would not address the first row.
print(train_dataset['sentence'].iloc[0])

first_review_tfidf = X_train_tfidf[0]
print(first_review_tfidf)

print(X_train_tfidf.shape)

# Map the non-zero column indices of the first review back to vocabulary words.
feature_names = vectorizer_tfidf.get_feature_names_out()
words_in_first_review = [feature_names[i] for i in first_review_tfidf.indices]
tfidf_scores_in_first_review = first_review_tfidf.data

print("\n Words and their scores in first review:")
for word, score in zip(words_in_first_review, tfidf_scores_in_first_review):
    print(f"{word}: {score}")

print("\n Test dataset")

# transform (not fit_transform): the test split must use the training vocabulary and weights.
X_test_tfidf = vectorizer_tfidf.transform(test_dataset['sentence'])

print(X_test_tfidf)

print(X_test_tfidf.shape)


##############################################################################################
print("\n --------------------------------------------")
# Mean TF-IDF score of every training sentence, taken over all vocabulary columns.
# np.asarray(...).ravel() flattens the (n_rows, 1) result whether the sparse
# container returns an np.matrix or a plain ndarray.
mean_tfidf_scores = np.asarray(X_train_tfidf.mean(axis=1)).ravel()

# Row positions ordered from the highest mean score to the lowest.
sorted_indices_by_tfidf = np.argsort(mean_tfidf_scores)[::-1]

# Training rows in that order; the index is reset so that position i is rank i.
sorted_X_train_by_tfidf = train_dataset.iloc[sorted_indices_by_tfidf].reset_index(drop=True)

print("Sorted documents:")

# Show at most the ten top-ranked sentences (fewer if the split is smaller).
for i in range(min(10, len(sorted_X_train_by_tfidf))):
    print(f"Document {i+1}:")
    print(sorted_X_train_by_tfidf['sentence'].iloc[i])  # text
    print(f"Label: {sorted_X_train_by_tfidf['label'].iloc[i]}")  # label
    print(f"Mean TF-IDF score: {mean_tfidf_scores[sorted_indices_by_tfidf[i]]}")  # mean TF-IDF score
    print("\n")




TF-IDF representation:
65374    lacks the inspiration of the original and has ...
14599    is also a film of freshness , imagination and ...
46387                  far more alienating than involving 
27872    the creative animation work may not look as fu...
40668            make for a winning , heartwarming yarn . 
                               ...                        
37194                          its provocative conclusion 
6265     an action film disguised as a war tribute is d...
54886    goes to absurd lengths to duck the very issues...
860                                 a perfect performance 
15795       constantly pulling the rug from underneath us 
Name: sentence, Length: 53879, dtype: object
  (0, 8690)	0.26820063512803094
  (0, 312)	0.26049882965499416
  (0, 546)	0.12210164046507045
  (0, 6360)	0.21879792952758437
  (0, 2037)	0.25690670757830447
  (0, 8671)	0.22297423692656196
  (0, 7703)	0.19676461248346275
  (0, 1)	0.23992049962692388
  (0, 145)	0.1479697726797965
  

In [None]:
# Word2Vec representation
#
# Whitespace-tokenise every training sentence; gensim expects a list of token lists.
sentences = [text.split() for text in train_dataset['sentence']]

# Train 100-dimensional embeddings on the training tokens. `sg` is left at its
# default here; the skip-gram variant (sg=1) is trained separately further down.
# min_count=1 keeps every word that occurs at least once in the vocabulary.
model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

def get_avg_word2vec(tokens, model):
    """Return the mean word vector of ``tokens`` under ``model``.

    Parameters
    ----------
    tokens : iterable of str
        The words of one sentence. Pass a token list, not a raw string:
        iterating a string yields single characters.
    model : object exposing ``wv``
        A trained embedding model whose ``wv`` supports ``word in wv``,
        ``wv[word]`` and ``wv.vector_size`` (e.g. a gensim ``Word2Vec``).

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the vectors of all tokens known to the
        model, or an all-zero vector of length ``model.wv.vector_size``
        when none of the tokens is known (or ``tokens`` is empty).
    """
    keyed_vectors = model.wv
    vecs = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if not vecs:
        # Size the fallback from the model so it always matches the real vectors.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(vecs, axis=0)

# Sentence vectors for the training split: the mean of each sentence's word
# vectors under `model` (the Word2Vec model trained above with the default `sg`).
X_train_word2vec = np.array([get_avg_word2vec(tokens, model) for tokens in sentences])

print("Word2Vec representation:")

print(X_train_word2vec[0])

print(X_train_word2vec.shape)

# Tokenise the test sentences exactly like the training ones. Feeding the raw
# strings to get_avg_word2vec would iterate over single characters, so the test
# vectors would be averages of character embeddings rather than word embeddings.
test_sentences = [sentence.split() for sentence in test_dataset['sentence']]

X_test_word2vec = np.array([get_avg_word2vec(tokens, model) for tokens in test_sentences])

print("Test dataset")

print(X_test_word2vec)

# use skip-gram w2v (sg=1) with otherwise identical hyper-parameters

model_sg = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4, sg=1)

X_train_w2v = np.array([get_avg_word2vec(tokens, model_sg) for tokens in sentences])

print("Word2Vec representation:")

print(X_train_w2v[0])

print(X_train_w2v.shape)

# Same token lists as above, now embedded with the skip-gram model.
X_test_w2v = np.array([get_avg_word2vec(tokens, model_sg) for tokens in test_sentences])

print("Test dataset")

print(X_test_w2v)




Word2Vec representation:
[ 2.82628201e-02  4.20010030e-01 -1.57125652e-01  2.38334000e-01
  5.55358455e-02 -7.43724585e-01  2.79231310e-01  8.10641825e-01
 -2.56153524e-01 -4.18405741e-01  1.32287830e-01 -6.09710813e-01
  8.53515342e-02  1.99555367e-01 -1.20504692e-01 -3.84209096e-01
  5.95132113e-01 -1.74518272e-01 -5.51213145e-01 -7.58663416e-01
  3.65022659e-01  4.54546422e-01  4.28041846e-01 -4.15758848e-01
  1.18964329e-01  1.12690538e-01 -3.67494226e-01  3.67028937e-02
 -5.71925044e-01  6.82434067e-02  1.45016760e-01  9.94308218e-02
  2.07243979e-01 -6.34631038e-01 -3.34113866e-01  2.81906188e-01
  7.13496730e-02  5.90516590e-02 -1.97743744e-01 -4.37382132e-01
  6.90085143e-02 -3.20872128e-01 -2.61156976e-01  1.28242120e-01
  2.66121477e-01 -4.76286113e-02 -7.00302005e-01 -1.23542309e-01
  9.07711834e-02  4.20856237e-01  1.50982395e-01 -1.60255298e-01
 -2.34907463e-01 -3.21171790e-01 -1.70740739e-01  3.26354131e-02
 -4.43009436e-02 -1.35166749e-01 -4.58806932e-01  3.16950351e-01


In [None]:
def train_and_evaluate(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters
    ----------
    model : estimator with ``fit(X, y)`` and ``predict(X)``
        Fitted in place on ``X_train`` / ``y_train``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1, micro_f1, weighted_f1)`` computed from the
        predictions for ``X_test`` against ``y_test``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    acc = accuracy_score(y_test, predictions)

    # One F1 value per averaging mode, in the order they are returned.
    f1_macro, f1_micro, f1_weighted = (
        f1_score(y_test, predictions, average=mode)
        for mode in ("macro", "micro", "weighted")
    )

    return acc, f1_macro, f1_micro, f1_weighted


# StandardScaler is only needed for the scaled-BOW experiment at the end of this cell.
from sklearn.preprocessing import StandardScaler

# with_mean=False: scale by variance only, without centring, so the sparse
# Bag-of-Words matrices can be scaled as they are.
scaler = StandardScaler(with_mean=False)

# Fit the scaler on the training features only and reuse it for the test features.
X_train_scaled = scaler.fit_transform(X_train_bow)
X_test_scaled = scaler.transform(X_test_bow)

# The 'saga' solver shuffles the data while optimising; a fixed random_state
# makes the reported metrics reproducible from run to run.
model1 = LogisticRegression(solver='saga', max_iter=250, random_state=42)
# NOTE(review): model2 is not used in this cell; the SVC cell further down
# rebinds it before fitting.
model2 = SVC(kernel='linear', C=10, random_state=42)

print("For Logistic Regression model")

# model1 is refitted from scratch by every train_and_evaluate call below, so the
# same estimator object can be reused for each representation.

print ("BOW metrics ")
accuracy, macro_f1, micro_f1, weighted_f1 = train_and_evaluate(model1, X_train_bow, train_dataset['label'], X_test_bow, test_dataset['label'])
print("BOW Accuracy:", accuracy)
print("BOW Macro F1:", macro_f1)
print("BOW Micro F1:", micro_f1)
print("BOW Weighted F1:", weighted_f1)


print ("\n TF-IDF metrics")
accuracy, macro_f1, micro_f1, weighted_f1 = train_and_evaluate(model1, X_train_tfidf, train_dataset['label'], X_test_tfidf, test_dataset['label'])
print("TF-IDF Accuracy:", accuracy)
print("TF-IDF  Macro F1:", macro_f1)
print("TF-IDF  Micro F1:", micro_f1)
print("TF-IDF  Weighted F1:", weighted_f1)


print ("\n Word2Vec metrics")

# Averaged word vectors from the first Word2Vec model (`model`).
accuracy, macro_f1, micro_f1, weighted_f1 = train_and_evaluate(model1, X_train_word2vec, train_dataset['label'], X_test_word2vec, test_dataset['label'])
print("Word2Vec Accuracy:", accuracy)
print("Word2Vec Macro F1:", macro_f1)
print("Word2Vec Micro F1:", micro_f1)
print("Word2Vec Weighted F1:", weighted_f1)

# Averaged word vectors from the skip-gram model (`model_sg`).

accuracy, macro_f1, micro_f1, weighted_f1 = train_and_evaluate(model1, X_train_w2v, train_dataset['label'], X_test_w2v, test_dataset['label'])
print("Skip-gram Accuracy:", accuracy)
print("Skip-gram Macro F1:", macro_f1)
print("Skip-gram Micro F1:", micro_f1)
print("Skip-gram Weighted F1:", weighted_f1)


# Bag-of-Words counts after variance scaling.
print ("\n Apply model on scaled data")

accuracy, macro_f1, micro_f1, weighted_f1 = train_and_evaluate(model1, X_train_scaled, train_dataset['label'], X_test_scaled, test_dataset['label'])
print("Scaled Accuracy:", accuracy)
print("Scaled Macro F1:", macro_f1)
print("Scaled Micro F1:", micro_f1)
print("Scaled Weighted F1:", weighted_f1)









For Logistic Regression model
BOW metrics 
BOW Accuracy: 0.8930215293244247
BOW Macro F1: 0.8911397386870734
BOW Micro F1: 0.8930215293244247
BOW Weighted F1: 0.8928950848942467

 TF-IDF metrics
TF-IDF Accuracy: 0.8853749072011878
TF-IDF  Macro F1: 0.8835240347187276
TF-IDF  Micro F1: 0.8853749072011878
TF-IDF  Weighted F1: 0.8853247657794017

 Word2Vec metrics




Word2Vec Accuracy: 0.43919821826280625
Word2Vec Macro F1: 0.3083937972197959
Word2Vec Micro F1: 0.43919821826280625
Word2Vec Weighted F1: 0.2715059680011115
Skip-gram Accuracy: 0.43912397921306606
Skip-gram Macro F1: 0.3093455278781115
Skip-gram Micro F1: 0.43912397921306606
Skip-gram Weighted F1: 0.27262793897498677

 Apply model on scaled data
Scaled Accuracy: 0.9024498886414254
Scaled Macro F1: 0.901218382407159
Scaled Micro F1: 0.9024498886414254
Scaled Weighted F1: 0.9025710740953291




In [None]:

# Linear-kernel SVM evaluated on the Bag-of-Words features.
print("\n --------------------------------------------", "For SVC model", sep="\n")

model2 = SVC(kernel='linear', C=5)

print("BOW")

# Plain NumPy label arrays; later cells reuse labels_train / labels_test.
labels_train, labels_test = (
    np.array(split['label']) for split in (train_dataset, test_dataset)
)

model2.fit(X_train_bow, labels_train)
y_pred = model2.predict(X_test_bow)

accuracy = accuracy_score(labels_test, y_pred)
# One F1 value per averaging mode, unpacked in the order they are printed.
f1_macro, f1_micro, f1_weighted = (
    f1_score(labels_test, y_pred, average=mode)
    for mode in ("macro", "micro", "weighted")
)

print("BOW Accuracy:", accuracy)
print("BOW Macro F1:", f1_macro)
print("BOW Micro F1:", f1_micro)
print("BOW Weighted F1:", f1_weighted)



 --------------------------------------------
For SVC model
BOW
BOW Accuracy: 0.9034149962880476
BOW Macro F1: 0.9017586980616235
BOW Micro F1: 0.9034149962880476
BOW Weighted F1: 0.9033231375527914


In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

# 50 trees; the fixed random_state makes the forest reproducible.
model3 = RandomForestClassifier(n_estimators=50, random_state=42)

# Bag-of-Words features: fit on the training split, then report test accuracy.
# fit() returns the estimator itself, so the score call can be chained.
accuracy = model3.fit(X_train_bow, labels_train).score(X_test_bow, labels_test)

print("Accuracy:", accuracy)

# TF-IDF features: the same estimator object is refitted from scratch.
accuracy = model3.fit(X_train_tfidf, labels_train).score(X_test_tfidf, labels_test)

print("Accuracy:", accuracy)





Accuracy: 0.8993318485523385
Accuracy: 0.894580549368968


In [None]:
# FastText embeddings


!pip install pandas numpy scikit-learn gensim nltk




In [None]:
import nltk

nltk.download('punkt')
from gensim.models import FastText
from nltk.tokenize import word_tokenize

print("TRAIN")

# .iloc[0] is the first row by position, i.e. the same review as
# contents_train_tokenized[0] below. The train/test split shuffled the index,
# so a label-based ['sentence'][0] would return a different review, or raise
# KeyError if the row labelled 0 is not in this split.
print(train_dataset['sentence'].iloc[0])

# NLTK word tokenisation of every training sentence, in row order.
contents_train_tokenized = [word_tokenize(text) for text in train_dataset['sentence']]

print(contents_train_tokenized[0])

print("TEST")

# First test row by position rather than a hard-coded index label.
print(test_dataset['sentence'].iloc[0])

contents_test_tokenized = [word_tokenize(text) for text in test_dataset['sentence']]

print(contents_test_tokenized[0])

# 100-dimensional FastText embeddings trained on the training tokens only.
fasttext_model = FastText(contents_train_tokenized, vector_size=100, window=5, min_count=1, workers=4)



def generate_embeddings(contents_tokenized, model=None):
    """Embed each tokenised text as the mean of its word vectors.

    Parameters
    ----------
    contents_tokenized : iterable of list of str
        One token list per text.
    model : object exposing ``wv``, optional
        Embedding model whose ``wv`` supports ``word in wv``, ``wv[word]``
        and ``wv.vector_size``. Defaults to the notebook-level
        ``fasttext_model``.

    Returns
    -------
    numpy.ndarray
        One row per text. A text with no usable token is represented by an
        all-zero row whose length is ``wv.vector_size``, so every row has the
        same width.
    """
    # Resolve the default at call time so the function also works when a model
    # is passed explicitly and no global fasttext_model exists.
    word_vectors = (fasttext_model if model is None else model).wv
    # Fallback sized from the model itself, so it cannot drift from vector_size.
    zero_vector = np.zeros(word_vectors.vector_size)

    embeddings = []
    for text in contents_tokenized:
        # Mean of the word vectors of this text, or the zero vector if there are none.
        vectors = [word_vectors[word] for word in text if word in word_vectors]
        embeddings.append(np.mean(vectors or [zero_vector], axis=0))

    return np.array(embeddings)


# Mean FastText vectors for the training texts first, then for the test texts.
embeddings_train, embeddings_test = map(
    generate_embeddings,
    (contents_train_tokenized, contents_test_tokenized),
)

print(embeddings_train.shape, embeddings_test.shape, sep="\n")

# model2 is the SVC created in the SVC cell; fitting it here replaces its
# Bag-of-Words fit. fit() returns the estimator, so scoring on the test
# embeddings is chained directly.
accuracy = model2.fit(embeddings_train, labels_train).score(embeddings_test, labels_test)
print("Accuracy:", accuracy)







[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


TRAIN
hide new secretions from the parental units 
['lacks', 'the', 'inspiration', 'of', 'the', 'original', 'and', 'has', 'a', 'bloated', 'plot', 'that', 'stretches', 'the', 'running', 'time', 'about', '10', 'minutes', 'past', 'a', 'child', "'s", 'interest', 'and', 'an', 'adult', "'s", 'patience']
TEST
with outtakes in which most of the characters forget their lines and just utter ` uhhh , ' which is better than most of the writing in the movie 
['with', 'outtakes', 'in', 'which', 'most', 'of', 'the', 'characters', 'forget', 'their', 'lines', 'and', 'just', 'utter', '`', 'uhhh', ',', "'", 'which', 'is', 'better', 'than', 'most', 'of', 'the', 'writing', 'in', 'the', 'movie']
(53879, 100)
(13470, 100)
Accuracy: 0.674090571640683
