In [190]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [191]:
import sys; sys.path.append('../..') ; sys.path.append('..') ;
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
# dummy trainloader
trainloader = data.DataLoader(data.TensorDataset(torch.Tensor(1), torch.Tensor(1)), batch_size=1, shuffle=True)
import matplotlib.pyplot as plt

In [192]:
import numpy as np
import pandas as pd
import random
seed = 0
np.random.seed(seed)
torch.random.manual_seed(seed)
random.seed(seed)

In [193]:
reviews_df = pd.read_csv('/content/final_data.csv')
# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(reviews_df.shape[0]))

# Display 10 random rows from the data.
reviews_df.sample(10)

Number of training sentences: 16,961



Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,category,rating,label,text_,sentiment,semantic_relevance,word_count,character_count,...,PC31,PC32,PC33,PC34,PC35,PC36,PC37,PC38,PC39,PC40
12973,12973,12973,Movies_and_TV_5,5.0,OR,needed to update from just dvd to blu-ray/dvd ...,Positive,High Relevance,24,130,...,0.090147,0.385865,0.57322,-0.48851,0.45613,-0.13586,0.235995,-1.045024,0.64408,-0.140371
11832,11832,11832,Electronics_5,3.0,CG,i like the ssd really well. the only problem i...,Positive,Medium Relevance,227,1043,...,0.050373,0.758291,1.707817,-1.478348,-2.365881,0.106346,-0.316299,-0.31318,2.369433,-0.920563
12,12,12,Home_and_Kitchen_5,5.0,CG,"did someone say, ""oriental for $60""? it is a ...",Positive,High Relevance,13,67,...,0.076484,-0.071074,0.51218,-0.750931,0.037356,0.413553,0.46818,-0.032393,0.094953,0.3362
14903,14903,14903,Movies_and_TV_5,4.0,OR,the movie is hard to follow. there are a lot ...,Positive,High Relevance,145,782,...,-0.942451,-1.144385,-0.884996,-0.564732,-0.40529,-0.754169,-0.339456,0.888037,-0.572727,-1.100155
5607,5607,5607,Sports_and_Outdoors_5,5.0,CG,lightweight and strong very well built. the on...,Positive,High Relevance,28,138,...,0.080337,-0.187978,0.207114,0.040186,0.160059,-0.058,-0.016345,-0.33179,0.342271,-0.259911
14237,14237,14237,Movies_and_TV_5,4.0,CG,i would like to have seen more of the movie. i...,Positive,High Relevance,59,275,...,-0.407604,0.196164,-0.809012,-0.067991,-0.365835,-0.551282,-0.158564,-0.187496,-0.361591,0.324024
11362,11362,11362,Electronics_5,5.0,CG,love this backpack! it fits my taurus and my ...,Positive,High Relevance,110,568,...,-0.588924,1.131728,-0.275148,0.047353,-1.0653,0.407195,0.343891,0.74831,-0.416374,0.234984
15471,15471,15471,Movies_and_TV_5,2.0,OR,when mark wahlberg openly implies the mistake ...,Positive,Low Relevance,302,1712,...,-0.157043,-1.617708,0.384038,0.600083,2.696927,1.443746,1.256126,-1.509512,0.724717,1.78147
4097,4097,4097,Sports_and_Outdoors_5,5.0,CG,my husband loves these shoes. they have the wi...,Positive,High Relevance,19,97,...,0.135535,-0.798277,0.907635,-0.150496,-0.045715,-0.452859,0.055879,-0.013852,0.029692,-0.583037
7678,7678,7678,Sports_and_Outdoors_5,3.0,OR,i also did not have the easiest time getting t...,Positive,Medium Relevance,200,982,...,-1.293616,-1.016999,0.60027,-2.563426,-2.984341,2.011421,1.180547,-0.585169,0.401979,-1.648251


In [194]:
from sklearn.preprocessing import LabelEncoder

# Label encoding for labels
encoder = LabelEncoder()
encoder.fit(reviews_df['label'])
reviews_df['label'] = encoder.transform(reviews_df['label'])

# Select only categorical features
categorical_features = reviews_df[['category', 'sentiment', 'semantic_relevance']]


In [195]:
#one hot encoding
one_hot_encoded = pd.get_dummies(categorical_features, prefix=['category', 'sentiment', 'semantic_relevance'])
pca_columns = reviews_df.filter(regex='^PC\d+$')
pc_columns = [f'PC{i}' for i in range(1, 41)]
pca_columns[pc_columns] = pca_columns[pc_columns].apply(pd.to_numeric, errors='coerce')
pca_columns[pc_columns] = pca_columns[pc_columns].fillna(pca_columns[pc_columns].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pca_columns[pc_columns] = pca_columns[pc_columns].apply(pd.to_numeric, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pca_columns[pc_columns] = pca_columns[pc_columns].fillna(pca_columns[pc_columns].mean())


In [196]:
df_with_PCA = pd.concat([one_hot_encoded, pca_columns, reviews_df['label']], axis=1)
df_with_PCA.head()

Unnamed: 0,category_Electronics_5,category_Home_and_Kitchen_5,category_Movies_and_TV_5,category_Sports_and_Outdoors_5,category_Tools_and_Home_Improvement_5,sentiment_Negative,sentiment_Neutral,sentiment_Positive,semantic_relevance_High Relevance,semantic_relevance_Low Relevance,...,PC32,PC33,PC34,PC35,PC36,PC37,PC38,PC39,PC40,label
0,0,1,0,0,0,0,0,1,1,0,...,0.006401,0.280228,-0.287937,0.88216,0.046707,0.187317,0.089769,0.2616,0.278248,0
1,0,1,0,0,0,0,0,1,1,0,...,-0.381576,0.380889,-0.089709,0.582129,0.311664,0.28295,-0.028493,0.212799,0.178306,0
2,0,1,0,0,0,0,0,1,1,0,...,-0.191953,0.527067,-0.304462,0.019836,-0.148001,0.074558,-0.003148,0.045887,-0.008477,0
3,0,1,0,0,0,0,0,1,0,1,...,-0.030093,0.309017,0.568731,0.121778,-0.395875,0.42316,-0.276112,0.481892,-0.182366,0
4,0,1,0,0,0,0,0,1,1,0,...,-0.278657,0.195943,0.02807,0.868029,0.114072,-0.0189,-0.605967,0.115912,0.362586,0


In [197]:
SEED = 0
train = df_with_PCA.sample(frac=0.8, random_state = SEED)
test = df_with_PCA.drop(train.index)

In [198]:
X_train = train.drop('label', axis=1)  # Features (excluding the 'label' column)
y_train = train['label']
X_test= test.drop('label', axis=1)
y_test= test['label']
X_train.head()

Unnamed: 0,category_Electronics_5,category_Home_and_Kitchen_5,category_Movies_and_TV_5,category_Sports_and_Outdoors_5,category_Tools_and_Home_Improvement_5,sentiment_Negative,sentiment_Neutral,sentiment_Positive,semantic_relevance_High Relevance,semantic_relevance_Low Relevance,...,PC31,PC32,PC33,PC34,PC35,PC36,PC37,PC38,PC39,PC40
12973,0,0,1,0,0,0,0,1,1,0,...,0.090147,0.385865,0.57322,-0.48851,0.45613,-0.13586,0.235995,-1.045024,0.64408,-0.140371
11832,1,0,0,0,0,0,0,1,0,0,...,0.050373,0.758291,1.707817,-1.478348,-2.365881,0.106346,-0.316299,-0.31318,2.369433,-0.920563
12,0,1,0,0,0,0,0,1,1,0,...,0.076484,-0.071074,0.51218,-0.750931,0.037356,0.413553,0.46818,-0.032393,0.094953,0.3362
14903,0,0,1,0,0,0,0,1,1,0,...,-0.942451,-1.144385,-0.884996,-0.564732,-0.40529,-0.754169,-0.339456,0.888037,-0.572727,-1.100155
5607,0,0,0,1,0,0,0,1,1,0,...,0.080337,-0.187978,0.207114,0.040186,0.160059,-0.058,-0.016345,-0.33179,0.342271,-0.259911


In [199]:
y_train.head()

12973    1
11832    0
12       0
14903    1
5607     0
Name: label, dtype: int64

In [200]:
class Model(nn.Module):
    def __init__(self,device="cpu"):
        super(Model, self).__init__()
        self.linear1 = nn.Linear(51,48)
        self.linear2 = nn.Linear(48,42)
        self.linear3 = nn.Linear(42,36)
        self.linear4 = nn.Linear(36,28)
        self.linear5 = nn.Linear(28,20)
        self.linear6 = nn.Linear(20,16)
        self.linear7 = nn.Linear(16,12)
        self.linear8 = nn.Linear(12,8)
        self.linear9 = nn.Linear(8,4)
        self.linear10 = nn.Linear(4,1)
    def forward(self, x):
        y = torch.sigmoid(self.linear1(x))
        y = torch.sigmoid(self.linear2(y))
        y = torch.sigmoid(self.linear3(y))
        y = torch.sigmoid(self.linear4(y))
        y = torch.sigmoid(self.linear5(y))
        y = torch.sigmoid(self.linear6(y))
        y = torch.sigmoid(self.linear7(y))
        y = torch.sigmoid(self.linear8(y))
        y = torch.sigmoid(self.linear9(y))
        y = torch.sigmoid(self.linear10(y))
        return y

In [201]:
model = Model().to(device)
for name, param in model.named_parameters():
    print(f"Parameter name: {name}")
    print(f"Parameter value: {param.data}")
    print("=" * 30)

Parameter name: linear1.weight
Parameter value: tensor([[-0.0010,  0.0751, -0.1152,  ..., -0.0621, -0.0051,  0.0896],
        [ 0.1392,  0.0556,  0.0189,  ..., -0.0806,  0.1317,  0.0944],
        [-0.0611, -0.0352, -0.1334,  ..., -0.1310, -0.1182, -0.0284],
        ...,
        [-0.0125,  0.0207,  0.0513,  ...,  0.1357, -0.0040,  0.1223],
        [ 0.0187, -0.0521,  0.1027,  ..., -0.0282, -0.1376,  0.1351],
        [ 0.0681, -0.0838, -0.1022,  ...,  0.1348, -0.0376, -0.0189]],
       device='cuda:0')
Parameter name: linear1.bias
Parameter value: tensor([-0.0858, -0.0694,  0.0308,  0.0590, -0.0554, -0.0515,  0.1109, -0.0721,
        -0.1045, -0.0218,  0.1332,  0.0305,  0.0193, -0.0174,  0.0875, -0.0694,
         0.0949,  0.0331,  0.0156,  0.0193, -0.0864,  0.1040,  0.0735, -0.1095,
        -0.1304, -0.0360,  0.0930, -0.0689,  0.1298, -0.0704, -0.1265,  0.0616,
        -0.0846, -0.0221, -0.0437, -0.0111, -0.1220, -0.1284,  0.0602, -0.0159,
        -0.0845, -0.0036, -0.1278,  0.0372,  0.0

In [202]:
optimizer = optim.AdamW(model.parameters(), lr=1e-2, weight_decay=1e-6)

In [203]:
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_train_tensor = X_train_tensor.to(device)
y_train_tensor = y_train_tensor.to(device)

In [204]:
for itr in range(1, 20001):
    optimizer.zero_grad()
    # x0, y = sample_annuli(device=device, n_samples=1024)
    yh = model(X_train_tensor)
    print(yh.shape)
    loss = nn.BCELoss()(yh.squeeze(), y_train_tensor.float())
    print(itr,loss.item())
    #loss_traj[itr-1] = loss.item()
    loss.backward()
    optimizer.step()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
torch.Size([13569, 1])
17501 0.08724232763051987
torch.Size([13569, 1])
17502 0.08723653852939606
torch.Size([13569, 1])
17503 0.08723150193691254
torch.Size([13569, 1])
17504 0.08722711354494095
torch.Size([13569, 1])
17505 0.08722297102212906
torch.Size([13569, 1])
17506 0.08721891045570374
torch.Size([13569, 1])
17507 0.08721505850553513
torch.Size([13569, 1])
17508 0.08721134811639786
torch.Size([13569, 1])
17509 0.08720771223306656
torch.Size([13569, 1])
17510 0.08720416575670242
torch.Size([13569, 1])
17511 0.08720067143440247
torch.Size([13569, 1])
17512 0.0871971994638443
torch.Size([13569, 1])
17513 0.08719372004270554
torch.Size([13569, 1])
17514 0.08719027042388916
torch.Size([13569, 1])
17515 0.08718682825565338
torch.Size([13569, 1])
17516 0.087183378636837
torch.Size([13569, 1])
17517 0.087179996073246
torch.Size([13569, 1])
17518 0.08717665076255798
torch.Size([13569, 1])
17519 0.08717331290245056
torch.Siz

In [205]:

with torch.no_grad():
    device = torch.device("cpu")  # Set the device to CPU
    xs, ys = X_train_tensor.to(device), y_train_tensor.to(device)
    model.to(device)  # Move the model to the same device
    y_pred = model(xs)
    y_pred = torch.where(y_pred > 0.5, torch.tensor(1).to(device), torch.tensor(0).to(device)).flatten()

    acc = (y_pred == ys).float().mean()
    print(acc.item())


0.9314613938331604


In [206]:
from sklearn.metrics import accuracy_score, classification_report

# Classification report
print("Classification Report:")
print(classification_report(ys, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      6773
           1       0.96      0.90      0.93      6796

    accuracy                           0.93     13569
   macro avg       0.93      0.93      0.93     13569
weighted avg       0.93      0.93      0.93     13569

