In [1]:
from google.colab import drive

# Directory inside the Colab VM under which Google Drive is exposed.
# The data paths used later in this notebook all start with this prefix.
DRIVE_MOUNT_POINT = '/content/drive'

# Mount Google Drive (prompts for authorization on first run in a session).
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m92.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


## **BERT t-SNE**

In [3]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.manifold import TSNE

# Seed for t-SNE so that repeated runs of this cell give the same embedding
RANDOM_STATE = 42

# Define the paths to the train and test data files
train_data_path = '/content/drive/MyDrive/NEEWWWWW/DATA_NEW/PC/1000/train_800.csv'
test_data_path = '/content/drive/MyDrive/NEEWWWWW/DATA_NEW/PC/1000/test_200.csv'

# Load the train and test data into pandas dataframes
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Initialize the tokenizer and the BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # inference only: make sure dropout is disabled

# Tokenize the texts in the train and test datasets
# (each text is truncated/padded to at most 16 tokens)
train_encodings = tokenizer(list(train_data['text']), truncation=True, padding=True, max_length=16)
test_encodings = tokenizer(list(test_data['text']), truncation=True, padding=True, max_length=16)

# Get the first token embeddings from the BERT model's outputs for the train and test datasets.
# The forward passes run under no_grad: we only need the activations, so building the
# autograd graph would just waste memory.
with torch.no_grad():
    train_outputs = model(torch.tensor(train_encodings['input_ids']), attention_mask=torch.tensor(train_encodings['attention_mask']))
    test_outputs = model(torch.tensor(test_encodings['input_ids']), attention_mask=torch.tensor(test_encodings['attention_mask']))

# outputs[0] is the last hidden state; index 0 along the sequence axis selects the first token
train_embeddings = train_outputs[0][:, 0, :].cpu().numpy()
test_embeddings = test_outputs[0][:, 0, :].cpu().numpy()

# Print the shapes of the embeddings before applying t-SNE
print(f'Train embeddings shape before t-SNE: {train_embeddings.shape}')  # should be (800, 768)
print(f'Test embeddings shape before t-SNE: {test_embeddings.shape}')  # should be (200, 768)

# Use t-SNE to reduce the dimensionality of the embeddings.
# scikit-learn's TSNE has no `transform` method, and two independent `fit_transform`
# calls produce embeddings in unrelated coordinate systems. To keep train and test
# comparable, embed the concatenated matrix once and split the result afterwards.
# method='exact' is required because the default Barnes-Hut solver only supports
# fewer than 4 output dimensions.
tsne = TSNE(n_components=128, perplexity=30, method='exact', random_state=RANDOM_STATE)
n_train = len(train_embeddings)
all_embeddings_tsne = tsne.fit_transform(np.concatenate([train_embeddings, test_embeddings], axis=0))
train_embeddings_tsne = all_embeddings_tsne[:n_train]
test_embeddings_tsne = all_embeddings_tsne[n_train:]

# Print the shapes of the embeddings after applying t-SNE
print(f'Train embeddings shape after t-SNE: {train_embeddings_tsne.shape}')  # should be (800, 128)
print(f'Test embeddings shape after t-SNE: {test_embeddings_tsne.shape}')  # should be (200, 128)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Train embeddings shape before t-SNE: (800, 768)
Test embeddings shape before t-SNE: (200, 768)
Train embeddings shape after t-SNE: (800, 128)
Test embeddings shape after t-SNE: (200, 128)


## **BERT PCA**

In [3]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.decomposition import PCA
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Seed for PCA: with these matrix sizes scikit-learn may pick the randomized SVD
# solver, whose result is only reproducible when random_state is fixed.
RANDOM_STATE = 42

# Define the paths to the train and test data files
train_data_path = '/content/drive/MyDrive/NEEWWWWW/DATA_NEW/PC/1000/train_800.csv'
test_data_path = '/content/drive/MyDrive/NEEWWWWW/DATA_NEW/PC/1000/test_200.csv'

# Load the train and test data into pandas dataframes
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Initialize the tokenizer and the BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # inference only: make sure dropout is disabled

# Tokenize the texts in the train and test datasets
# (each text is truncated/padded to at most 16 tokens)
train_encodings = tokenizer(list(train_data['text']), truncation=True, padding=True, max_length=16)
test_encodings = tokenizer(list(test_data['text']), truncation=True, padding=True, max_length=16)

# Get the first token embeddings from the BERT model's outputs for the train and test datasets.
# The forward passes run under no_grad: we only need the activations, so building the
# autograd graph would just waste memory.
with torch.no_grad():
    train_outputs = model(torch.tensor(train_encodings['input_ids']), attention_mask=torch.tensor(train_encodings['attention_mask']))
    test_outputs = model(torch.tensor(test_encodings['input_ids']), attention_mask=torch.tensor(test_encodings['attention_mask']))

# outputs[0] is the last hidden state; index 0 along the sequence axis selects the first token
train_embeddings = train_outputs[0][:, 0, :].cpu().numpy()
test_embeddings = test_outputs[0][:, 0, :].cpu().numpy()

# Print the shapes of the embeddings before applying PCA
print(f'Train embeddings shape before PCA: {train_embeddings.shape}')  # should be (800, 768)
print(f'Test embeddings shape before PCA: {test_embeddings.shape}')  # should be (200, 768)

# Apply PCA to reduce the dimension from 768 to 128.
# The projection is learned on the train set only and then re-used for the test set,
# so both end up in the same 128-dimensional space without leaking test data.
pca_model = PCA(n_components=128, random_state=RANDOM_STATE)
train_embeddings_pca = pca_model.fit_transform(train_embeddings)
test_embeddings_pca = pca_model.transform(test_embeddings)

# Print the shapes of the embeddings after applying PCA
print(f'Train embeddings shape after PCA: {train_embeddings_pca.shape}')  # should be (800, 128)
print(f'Test embeddings shape after PCA: {test_embeddings_pca.shape}')  # should be (200, 128)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Train embeddings shape before PCA: (800, 768)
Test embeddings shape before PCA: (200, 768)
Train embeddings shape after PCA: (800, 128)
Test embeddings shape after PCA: (200, 128)


## **Word2Vec PCA**

In [3]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.decomposition import PCA

# Seed for PCA: with these matrix sizes scikit-learn may pick the randomized SVD
# solver, whose result is only reproducible when random_state is fixed.
RANDOM_STATE = 42


def sentence_to_vector(sentence, keyed_vectors):
    """Return the mean word vector of a sentence.

    The sentence is split on whitespace and the vectors of all tokens present in
    `keyed_vectors` are averaged.

    Args:
        sentence: The text to embed. Non-string values (e.g. NaN from an empty CSV
            field) are treated as a sentence without any known token.
        keyed_vectors: Word-vector lookup supporting `token in kv`, `kv[token]`
            and `kv.vector_size`.

    Returns:
        A 1-D numpy array of length `keyed_vectors.vector_size`; all zeros when no
        token of the sentence is in the vocabulary.
    """
    tokens = sentence.split() if isinstance(sentence, str) else []
    vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not vectors:
        # No usable token: fall back to the zero vector so every row has the same width
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(vectors, axis=0)


# Load the datasets
df_train = pd.read_csv("/content/drive/MyDrive/NEEWWWWW/DATA_NEW/PC/1000/train_800.csv")
df_test = pd.read_csv("/content/drive/MyDrive/NEEWWWWW/DATA_NEW/PC/1000/test_200.csv")

# Load the pretrained word2vec model
word2vec_model = '/content/drive/MyDrive/NEEWWWWW/GoogleNews-vectors-negative300.bin'
w2v = KeyedVectors.load_word2vec_format(word2vec_model, binary=True)

# Convert the sentences in the datasets to embeddings using the word2vec model
train_embeddings = np.array([sentence_to_vector(sentence, w2v) for sentence in df_train['text']])
test_embeddings = np.array([sentence_to_vector(sentence, w2v) for sentence in df_test['text']])

# Use PCA to reduce the dimensionality of the embeddings.
# The projection is learned on the train set only and then re-used for the test set.
pca = PCA(n_components=128, random_state=RANDOM_STATE)
train_embeddings_pca = pca.fit_transform(train_embeddings)
test_embeddings_pca = pca.transform(test_embeddings)

# Print the shape of the embeddings before and after PCA
print(f"Shape of embeddings before PCA: {train_embeddings.shape}")
print(f"Shape of embeddings after PCA: {train_embeddings_pca.shape}")

Shape of embeddings before PCA: (800, 300)
Shape of embeddings after PCA: (800, 128)


## **Word2Vec t-SNE**

In [8]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE

# Seed for t-SNE so that repeated runs of this cell give the same embedding
RANDOM_STATE = 42


def sentence_to_vector(sentence, keyed_vectors):
    """Return the mean word vector of a sentence.

    The sentence is split on whitespace and the vectors of all tokens present in
    `keyed_vectors` are averaged.

    Args:
        sentence: The text to embed. Non-string values (e.g. NaN from an empty CSV
            field) are treated as a sentence without any known token.
        keyed_vectors: Word-vector lookup supporting `token in kv`, `kv[token]`
            and `kv.vector_size`.

    Returns:
        A 1-D numpy array of length `keyed_vectors.vector_size`; all zeros when no
        token of the sentence is in the vocabulary.
    """
    tokens = sentence.split() if isinstance(sentence, str) else []
    vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not vectors:
        # No usable token: fall back to the zero vector so every row has the same width
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(vectors, axis=0)


# Load the datasets
df_train = pd.read_csv("/content/drive/MyDrive/NEEWWWWW/DATA_NEW/PC/1000/train_800.csv")
df_test = pd.read_csv("/content/drive/MyDrive/NEEWWWWW/DATA_NEW/PC/1000/test_200.csv")

# Load the pretrained word2vec model
word2vec_model = '/content/drive/MyDrive/NEEWWWWW/GoogleNews-vectors-negative300.bin'
w2v = KeyedVectors.load_word2vec_format(word2vec_model, binary=True)

# Convert the sentences in the datasets to embeddings using the word2vec model
train_embeddings = np.array([sentence_to_vector(sentence, w2v) for sentence in df_train['text']])
test_embeddings = np.array([sentence_to_vector(sentence, w2v) for sentence in df_test['text']])

# Use t-SNE to reduce the dimensionality of the embeddings.
# scikit-learn's TSNE has no `transform` method, and two independent `fit_transform`
# calls produce embeddings in unrelated coordinate systems. To keep train and test
# comparable, embed the concatenated matrix once and split the result afterwards.
# method='exact' is required because the default Barnes-Hut solver only supports
# fewer than 4 output dimensions.
tsne = TSNE(n_components=128, perplexity=30, method='exact', random_state=RANDOM_STATE)
n_train = len(train_embeddings)
all_embeddings_tsne = tsne.fit_transform(np.concatenate([train_embeddings, test_embeddings], axis=0))
train_embeddings_tsne = all_embeddings_tsne[:n_train]
test_embeddings_tsne = all_embeddings_tsne[n_train:]

# Print the dimension size before and after t-SNE
print(f"Dimension size before t-SNE: {train_embeddings.shape}")
print(f"Dimension size after t-SNE: {train_embeddings_tsne.shape}")

Dimension size before t-SNE: (800, 300)
Dimension size after t-SNE: (800, 128)
