# **Lab** : Get familiar with NLP language models using Pytorch library

## Part1 Classification Regression :

Import Libraries :

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import numpy as np

In [None]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...

[nltk_data]   Unzipping tokenizers/punkt.zip.

[nltk_data] Downloading package stopwords to /root/nltk_data...

[nltk_data]   Unzipping corpora/stopwords.zip.


True

Scrapping and Cleaning ans Scoring the texts :

In [None]:
# Define keywords
keywords = ['خامنئي', 'الإيراني', 'إيران']
stop_words = set(stopwords.words('arabic'))

In [None]:
# Function to clean text
def clean_text(txt):
    # Remove HTML tags and non-Arabic characters
    clean = re.sub('<[^<]+?>', '', txt)
    # Define regex pattern to match Arabic alphabet and spaces
    pattern = re.compile(r'[^\u0600-\u06FF\s]+')
    # Substitute non-matching characters with an empty string
    clean = re.sub(pattern, '', clean)
    # Remove extra spaces
    clean = re.sub(r'\s+', ' ', clean)
    return clean

# Function to extract and clean text from soup object
def extract_and_clean_text(soup):
    # Find all h1 and p tags
    tags = soup.find_all(['h1', 'p'])
    # Extract text from tags
    extracted_text = ''
    for tag in tags:
        extracted_text += tag.get_text() + '\n'
    # Clean extracted text
    cleaned_text = clean_text(extracted_text)
    return cleaned_text

In [None]:
# Function to scrape and process multiple URLs
def scrape_and_process(urls):
    texts = []
    for url in urls:
        req = requests.get(url)
        soup = BeautifulSoup(req.text, "html.parser")
        cleaned_text = extract_and_clean_text(soup)
        texts.append(cleaned_text)
    return texts

In [None]:
# Scoring function based on keyword frequency and text length
def score_text(text, keywords, stop_words):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    keyword_count = sum(1 for word in filtered_words if word in keywords)

    if len(filtered_words) == 0:
        return 0.0

    # Adjust the weight of keyword count and normalize the score to be between 0 and 10
    keyword_score = min((keyword_count / len(filtered_words)) * 100, 10)

    return round(keyword_score, 1)

# Function to apply scoring to a DataFrame
def apply_scoring(df, keywords, stop_words):
    df['Score'] = df['Text'].apply(lambda text: score_text(text, keywords, stop_words))
    return df

In [None]:
# List of URLs to scrape
urls = [
    'https://www.dw.com/ar/%D8%AA%D8%BA%D8%B7%D9%8A%D8%A9-%D9%85%D8%A8%D8%A7%D8%B4%D8%B1%D8%A9-%D9%84%D9%88%D9%81%D8%A7%D8%A9-%D8%A7%D9%84%D8%B1%D8%A6%D9%8A%D8%B3-%D8%A7%D9%84%D8%A5%D9%8A%D8%B1%D8%A7%D9%86%D9%8A-%D8%A5%D8%A8%D8%B1%D8%A7%D9%87%D9%8A%D9%85-%D8%B1%D8%A6%D9%8A%D8%B3%D9%8A-%D9%81%D9%8A-%D8%AD%D8%A7%D8%AF%D8%AB-%D8%AA%D8%AD%D8%B7%D9%85-%D9%85%D8%B1%D9%88%D8%AD%D9%8A%D8%A9/a-69127965',
    'https://www.aljazeera.net/news/2024/5/20/%D8%A5%D9%8A%D8%B1%D8%A7%D9%86-%D8%AA%D8%B9%D9%84%D9%86-%D9%85%D8%B5%D8%B1%D8%B9-%D8%A7%D9%84%D8%B1%D8%A6%D9%8A%D8%B3-%D8%A5%D8%A8%D8%B1%D8%A7%D9%87%D9%8A%D9%85-%D8%B1%D8%A6%D9%8A%D8%B3%D9%8A',
    'https://asharq.com/politics/88723/%D9%85%D9%86-%D9%81%D9%82%D8%AF%D8%A7%D9%86-%D8%A7%D9%84%D8%A7%D8%AA%D8%B5%D8%A7%D9%84-%D8%AD%D8%AA%D9%89-%D9%88%D9%81%D8%A7%D8%A9-%D8%A7%D9%84%D8%B1%D8%A6%D9%8A%D8%B3-%D8%AA%D9%81%D8%A7%D8%B5%D9%8A%D9%84-%D8%B3%D8%A7%D8%B9%D8%A7%D8%AA-%D8%AD%D8%B1%D8%AC%D8%A9-%D8%A5%D9%8A%D8%B1%D8%A7%D9%86/',
    'https://www.aljazeera.net/politics/2024/5/23/%d9%85%d8%b3%d8%aa%d9%88%d9%89-%d8%a7%d9%84%d9%88%d9%81%d9%88%d8%af-%d8%a7%d9%84%d9%85%d8%b4%d8%a7%d8%b1%d9%83%d8%a9-%d8%a8%d8%ac%d9%86%d8%a7%d8%b2%d8%a9-%d8%a7%d9%84%d8%b1%d8%a6%d9%8a%d8%b3',
    'https://www.aljazeera.net/news/2024/5/23/%d9%82%d8%a7%d8%af%d8%a9-%d9%85%d8%ad%d9%88%d8%b1-%d8%a7%d9%84%d9%85%d9%82%d8%a7%d9%88%d9%85%d8%a9-%d9%8a%d8%ac%d8%aa%d9%85%d8%b9%d9%88%d9%86-%d9%81%d9%8a-%d8%a5%d9%8a%d8%b1%d8%a7%d9%86',
    'https://www.aljazeera.net/news/2024/5/23/%d8%a7%d9%84%d9%82%d8%a7%d8%a6%d9%85-%d8%a8%d8%a3%d8%b9%d9%85%d8%a7%d9%84-%d8%a7%d9%84%d8%ae%d8%a7%d8%b1%d8%ac%d9%8a%d8%a9-%d8%a7%d9%84%d8%a5%d9%8a%d8%b1%d8%a7%d9%86%d9%8a%d8%a9-2',
    'https://www.aljazeera.net/politics/2024/5/23/%d8%ad%d9%83%d8%a7%d9%8a%d8%a9-%d9%88%d8%af%d9%84%d8%a7%d9%84%d8%a9-%d9%87%d9%83%d8%b0%d8%a7-%d8%a7%d9%82%d8%aa%d8%b1%d9%86-%d8%b1%d9%82%d9%85-8-%d8%a8%d8%a7%d9%84%d8%b1%d8%a6%d9%8a%d8%b3',
    'https://www.aljazeera.net/news/2024/5/23/%d8%b1%d8%a6%d9%8a%d8%b3%d9%8a-%d9%88%d8%b9%d8%a8%d8%af-%d8%a7%d9%84%d9%84%d9%87%d9%8a%d8%a7%d9%86-%d9%8a%d9%88%d8%a7%d8%b1%d9%8a%d8%a7%d9%86-%d8%a7%d9%84%d8%ab%d8%b1%d9%89-%d9%88%d8%a3%d9%88%d9%84',
    'https://www.aljazeera.net/news/2024/5/22/%d8%ad%d9%85%d8%a7%d8%b3-%d9%88%d8%ad%d8%b2%d8%a8-%d8%a7%d9%84%d9%84%d9%87-%d9%88%d8%af%d9%88%d9%84-%d8%b9%d8%b1%d8%a8%d9%8a%d8%a9-%d8%aa%d8%b4%d8%a7%d8%b1%d9%83-%d9%81%d9%8a',
    'https://www.aljazeera.net/politics/2024/5/22/%d8%a3%d8%b3%d8%a8%d8%a7%d8%a8-%d8%a7%d8%b7%d9%85%d8%a6%d9%86%d8%a7%d9%86-%d8%a5%d8%af%d8%a7%d8%b1%d8%a9-%d8%a8%d8%a7%d9%8a%d8%af%d9%86-%d9%84%d8%a7%d9%86%d8%aa%d9%82%d8%a7%d9%84',
    'https://www.aljazeera.net/news/2024/5/22/%d8%a7%d9%84%d8%b1%d8%a6%d8%a7%d8%b3%d8%a9-%d8%a7%d9%84%d8%a5%d9%8a%d8%b1%d8%a7%d9%86%d9%8a%d8%a9-%d8%aa%d9%83%d8%b4%d9%81-%d8%aa%d9%81%d8%a7%d8%b5%d9%8a%d9%84',
    'https://www.aljazeera.net/politics/2024/5/22/%d8%a3%d9%88%d8%b1%d8%a8%d8%a7%d9%86-%d8%aa%d8%b1%d9%85%d8%a8-%d9%84%d9%88%d9%83%d8%a7%d8%b4%d9%8a%d9%86%d9%83%d9%88-%d9%85%d9%88%d9%82%d8%b9-%d8%b1%d9%88%d8%b3%d9%8a-%d9%85%d9%86',
    'https://www.aljazeera.net/news/2024/5/22/%d8%a7%d9%84%d9%85%d8%b1%d8%b4%d8%af-%d8%a7%d9%84%d8%a5%d9%8a%d8%b1%d8%a7%d9%86%d9%8a-%d9%8a%d8%aa%d9%82%d8%af%d9%85-%d9%85%d8%b1%d8%a7%d8%b3%d9%85-%d8%aa%d8%b4%d9%8a%d9%8a%d8%b9',
    'https://www.aljazeera.net/news/2024/5/21/%d8%a5%d9%8a%d8%b1%d8%a7%d9%86-%d8%aa%d8%b4%d9%8a%d8%b9-%d8%b1%d8%a6%d9%8a%d8%b3%d9%8a-%d9%81%d9%8a-%d9%82%d9%85-%d9%88%d8%ba%d8%af%d8%a7-%d9%81%d9%8a-%d8%b7%d9%87%d8%b1%d8%a7%d9%86',
    'https://aljazeera.net/ebusiness/2024/5/21/%D8%B1%D8%BA%D9%85-%D8%A7%D9%84%D8%B9%D9%82%D9%88%D8%A8%D8%A7%D8%AA-%D9%83%D9%8A%D9%81-%D8%A3%D8%B9%D8%A7%D8%AF-%D8%A5%D8%A8%D8%B1%D8%A7%D9%87%D9%8A%D9%85-%D8%B1%D8%A6%D9%8A%D8%B3%D9%8A',
    'https://www.aljazeera.net/politics/2024/5/21/%d9%85%d8%a7-%d8%ad%d8%af%d9%88%d8%af-%d8%aa%d8%ba%d9%8a%d8%b1-%d8%b3%d9%8a%d8%a7%d8%b3%d8%a9-%d8%a5%d9%8a%d8%b1%d8%a7%d9%86-%d8%a7%d9%84%d8%ae%d8%a7%d8%b1%d8%ac%d9%8a%d8%a9-%d8%a8%d8%b9%d8%af',
    'https://www.aljazeera.net/news/2024/5/21/%d8%a5%d9%8a%d8%b1%d8%a7%d9%86-%d8%aa%d8%a8%d8%af%d8%a3-%d9%85%d8%b1%d8%a7%d8%b3%d9%85-%d8%aa%d8%b4%d9%8a%d9%8a%d8%b9-%d8%b1%d8%a6%d9%8a%d8%b3%d9%87%d8%a7-%d9%88%d9%88%d8%b2%d9%8a%d8%b1',
]

In [None]:
# Scrape and process URLs
texts = scrape_and_process(urls)

In [None]:
# Create a DataFrame with the scraped texts
df = pd.DataFrame({'Text': texts})

# Apply the scoring function to the DataFrame
df = apply_scoring(df, keywords, stop_words)

# Print the DataFrame with scores
df

Unnamed: 0,Text,Score
0,تغطية مباشرة لوفاة الرئيس الإيراني في حادث تح...,3.2
1,إيران تعلن وفاة الرئيس إبراهيم رئيسي ومرافقيه ...,3.3
2,من فقدان الاتصال حتى إعلان وفاة الرئيس تفاصيل ...,2.4
3,جنازة رئيسي حضور دولي واسع وغياب الأسد يثير تس...,1.9
4,قادة محور المقاومة يجتمعون في إيران خلال جنازة...,1.2
5,باقري إيران ستستمر في إستراتيجيتها لدعم المقاو...,3.0
6,هكذا اقترن رقم بالرئيس الإيراني الراحل طهران ي...,2.0
7,رئيسي يوارى الثرى اليوم في مسقط رأسه بمشهد يوا...,2.2
8,حماس وحزب الله ودول عربية تشارك في مراسم عزاء ...,3.3
9,لا تتوقع تغييرا كبيرا واشنطن تراقب نقل السلطة ...,1.5


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize NLP tools
stop_words = set(stopwords.words('arabic'))
stemmer = SnowballStemmer('arabic')
lemmatizer = WordNetLemmatizer()

# Define preprocessing function
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)

    # Stop word removal
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmed_tokens = [stemmer.stem(word) for word in tokens]

    # Join tokens back into a string
    preprocessed_text = ' '.join(stemmed_tokens)

    return preprocessed_text

# Apply preprocessing to the DataFrame
df['Preprocessed_Text'] = df['Text'].apply(preprocess_text)

# Display the DataFrame with preprocessed text
df

[nltk_data] Downloading package punkt to /root/nltk_data...

[nltk_data]   Package punkt is already up-to-date!

[nltk_data] Downloading package stopwords to /root/nltk_data...

[nltk_data]   Package stopwords is already up-to-date!

[nltk_data] Downloading package wordnet to /root/nltk_data...

[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Text,Score,Preprocessed_Text
0,تغطية مباشرة لوفاة الرئيس الإيراني في حادث تح...,3.2,تغط مباشر لوفا رييس ايران حادث تحطم مروح نعت س...
1,إيران تعلن وفاة الرئيس إبراهيم رئيسي ومرافقيه ...,3.3,ايرا تعل وفا رييس ابراهيم رييس مرافق اثر تحطم ...
2,من فقدان الاتصال حتى إعلان وفاة الرئيس تفاصيل ...,2.4,قدا اتصال اعلا وفا رييس تفاصيل ساع حرج عاش اير...
3,جنازة رئيسي حضور دولي واسع وغياب الأسد يثير تس...,1.9,جناز رييس حضور دول واسع غياب اسد يثير تساول طه...
4,قادة محور المقاومة يجتمعون في إيران خلال جنازة...,1.2,قاد محور مقاوم يجتمع ايرا خلال جناز رييس عقد ق...
5,باقري إيران ستستمر في إستراتيجيتها لدعم المقاو...,3.0,اقر ايرا ستستمر استراتيج لدعم مقاوم قال قايم ا...
6,هكذا اقترن رقم بالرئيس الإيراني الراحل طهران ي...,2.0,اقتر رقم رييس ايران راحل طهرا يلف متابع صحاف ا...
7,رئيسي يوارى الثرى اليوم في مسقط رأسه بمشهد يوا...,2.2,رييس يوارى ثرى يوم مسقط راس مشهد يوارى ثرى يوم...
8,حماس وحزب الله ودول عربية تشارك في مراسم عزاء ...,3.3,حماس حزب الله دول عرب تشار مراسم عزاء رييس انض...
9,لا تتوقع تغييرا كبيرا واشنطن تراقب نقل السلطة ...,1.5,تتوقع تغيير كبير واشنط تراقب نقل سلط ايرا حذر ...


RNN :

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from gensim.models import Word2Vec
import torch.nn.functional as F

class TextDataset(Dataset):
    def __init__(self, texts, scores, word2vec_model, max_length=100):
        self.texts = texts
        self.scores = scores
        self.word2vec_model = word2vec_model
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenized_text = preprocess_text(self.texts[idx]).split()
        embeddings = [self.word2vec_model.wv[word] for word in tokenized_text if word in self.word2vec_model.wv]

        # Pad embeddings if necessary
        padded_embeddings = self.pad_sequence(embeddings)

        return torch.tensor(padded_embeddings), torch.tensor(self.scores[idx])

    def pad_sequence(self, embeddings):
        if len(embeddings) >= self.max_length:
            return embeddings[:self.max_length]
        else:
            num_padding = self.max_length - len(embeddings)
            padded_embeddings = embeddings + [torch.zeros_like(embeddings[0]) for _ in range(num_padding)]
            return padded_embeddings

# Train Word2Vec model
tokenized_texts = [preprocess_text(text).split() for text in df['Text']]
word2vec_model = Word2Vec(tokenized_texts, vector_size=100, window=5, min_count=1, sg=0)

# Save the trained Word2Vec model for later use
word2vec_model.save("word2vec_model.bin")

# Create dataset and dataloader
dataset = TextDataset(df['Text'], df['Score'], word2vec_model)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define RNN model
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

# Initialize model, loss function, and optimizer
input_size = word2vec_model.vector_size
hidden_size = 128
output_size = 1  # Regression task
model = RNN(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with MSE score calculation
num_epochs = 10
for epoch in range(num_epochs):
    total_loss = 0.0
    total_samples = 0

    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(1).float())  # Reshape labels for regression
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * inputs.size(0)  # Multiply by batch size to account for varying batch sizes
        total_samples += inputs.size(0)

    mse_score = total_loss / total_samples  # Calculate MSE score
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {mse_score:.4f}')

Epoch [1/10], Loss: 6.7296

Epoch [2/10], Loss: 6.3240

Epoch [3/10], Loss: 5.9313

Epoch [4/10], Loss: 5.5184

Epoch [5/10], Loss: 5.0493

Epoch [6/10], Loss: 4.4798

Epoch [7/10], Loss: 3.7584

Epoch [8/10], Loss: 2.8638

Epoch [9/10], Loss: 1.9443

Epoch [10/10], Loss: 1.3360


Bidirectional RNN:

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from gensim.models import Word2Vec
import torch.nn.functional as F

# Define preprocess_text function
def preprocess_text(text):
    # Your preprocessing logic here
    return text

class TextDataset(Dataset):
    def __init__(self, texts, scores, word2vec_model, max_length=100):
        self.texts = texts
        self.scores = scores
        self.word2vec_model = word2vec_model
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenized_text = preprocess_text(self.texts[idx]).split()
        embeddings = [self.word2vec_model.wv[word] for word in tokenized_text if word in self.word2vec_model.wv]

        # Pad embeddings if necessary
        padded_embeddings = self.pad_sequence(embeddings)

        return torch.tensor(padded_embeddings), torch.tensor(self.scores[idx])

    def pad_sequence(self, embeddings):
        if len(embeddings) >= self.max_length:
            return embeddings[:self.max_length]
        else:
            num_padding = self.max_length - len(embeddings)
            padded_embeddings = embeddings + [torch.zeros_like(embeddings[0]) for _ in range(num_padding)]
            return padded_embeddings

# Train Word2Vec model
tokenized_texts = [preprocess_text(text).split() for text in df['Text']]
word2vec_model = Word2Vec(tokenized_texts, vector_size=100, window=5, min_count=1, sg=0)

# Save the trained Word2Vec model for later use
word2vec_model.save("word2vec_model.bin")

# Create dataset and dataloader
dataset = TextDataset(df['Text'], df['Score'], word2vec_model)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define BiRNN model
class BiRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(BiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)  # Multiply by 2 for bidirectional

    def forward(self, x):
        h0 = torch.zeros(2, x.size(0), self.hidden_size).to(x.device)  # 2 for bidirectional
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])  # Concatenating forward and backward outputs
        return out

# Initialize model, loss function, and optimizer
input_size = word2vec_model.vector_size
hidden_size = 128
output_size = 1  # Regression task
model = BiRNN(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with MSE score calculation
num_epochs = 10
for epoch in range(num_epochs):
    total_loss = 0.0
    total_samples = 0

    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(1).float())  # Reshape labels for regression
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * inputs.size(0)  # Multiply by batch size to account for varying batch sizes
        total_samples += inputs.size(0)

    mse_score = total_loss / total_samples  # Calculate MSE score
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {mse_score:.4f}')

Epoch [1/10], Loss: 5.7339

Epoch [2/10], Loss: 5.4059

Epoch [3/10], Loss: 5.0619

Epoch [4/10], Loss: 4.6813

Epoch [5/10], Loss: 4.2392

Epoch [6/10], Loss: 3.6981

Epoch [7/10], Loss: 3.0216

Epoch [8/10], Loss: 2.2467

Epoch [9/10], Loss: 1.5755

Epoch [10/10], Loss: 1.2278


GRU :

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from gensim.models import Word2Vec

# Define preprocess_text function
def preprocess_text(text):
    # Your preprocessing logic here
    return text

class TextDataset(Dataset):
    def __init__(self, texts, scores, word2vec_model, max_length=100):
        self.texts = texts
        self.scores = scores
        self.word2vec_model = word2vec_model
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenized_text = preprocess_text(self.texts[idx]).split()
        embeddings = [self.word2vec_model.wv[word] for word in tokenized_text if word in self.word2vec_model.wv]

        # Pad embeddings if necessary
        padded_embeddings = self.pad_sequence(embeddings)

        return torch.tensor(padded_embeddings), torch.tensor(self.scores[idx])

    def pad_sequence(self, embeddings):
        if len(embeddings) >= self.max_length:
            return embeddings[:self.max_length]
        else:
            num_padding = self.max_length - len(embeddings)
            padded_embeddings = embeddings + [torch.zeros_like(embeddings[0]) for _ in range(num_padding)]
            return padded_embeddings

# Train Word2Vec model
tokenized_texts = [preprocess_text(text).split() for text in df['Text']]
word2vec_model = Word2Vec(tokenized_texts, vector_size=100, window=5, min_count=1, sg=0)

# Save the trained Word2Vec model for later use
word2vec_model.save("word2vec_model.bin")

# Create dataset and dataloader
dataset = TextDataset(df['Text'], df['Score'], word2vec_model)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define GRU model
class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.gru(x, h0)
        out = self.fc(out[:, -1, :])
        return out

# Initialize model, loss function, and optimizer
input_size = word2vec_model.vector_size
hidden_size = 128
output_size = 1  # Regression task
model = GRU(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with MSE score calculation
num_epochs = 50
for epoch in range(num_epochs):
    total_loss = 0.0
    total_samples = 0

    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(1).float())  # Reshape labels for regression
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * inputs.size(0)  # Multiply by batch size to account for varying batch sizes
        total_samples += inputs.size(0)

    mse_score = total_loss / total_samples  # Calculate MSE score
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {mse_score:.4f}')

Epoch [1/50], Loss: 5.6204

Epoch [2/50], Loss: 5.4475

Epoch [3/50], Loss: 5.2762

Epoch [4/50], Loss: 5.1047

Epoch [5/50], Loss: 4.9311

Epoch [6/50], Loss: 4.7536

Epoch [7/50], Loss: 4.5696

Epoch [8/50], Loss: 4.3760

Epoch [9/50], Loss: 4.1686

Epoch [10/50], Loss: 3.9421

Epoch [11/50], Loss: 3.6897

Epoch [12/50], Loss: 3.4021

Epoch [13/50], Loss: 3.0666

Epoch [14/50], Loss: 2.6658

Epoch [15/50], Loss: 2.1808

Epoch [16/50], Loss: 1.6167

Epoch [17/50], Loss: 1.1791

Epoch [18/50], Loss: 1.7473

Epoch [19/50], Loss: 1.9227

Epoch [20/50], Loss: 1.6022

Epoch [21/50], Loss: 1.2661

Epoch [22/50], Loss: 1.1658

Epoch [23/50], Loss: 1.2269

Epoch [24/50], Loss: 1.3195

Epoch [25/50], Loss: 1.3867

Epoch [26/50], Loss: 1.4152

Epoch [27/50], Loss: 1.4071

Epoch [28/50], Loss: 1.3695

Epoch [29/50], Loss: 1.3122

Epoch [30/50], Loss: 1.2477

Epoch [31/50], Loss: 1.1909

Epoch [32/50], Loss: 1.1571

Epoch [33/50], Loss: 1.1557

Epoch [34/50], Loss: 1.1820

Epoch [35/50], Loss: 1.

LSTM :

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from gensim.models import Word2Vec

# Define preprocess_text function
def preprocess_text(text):
    # Your preprocessing logic here
    return text

class TextDataset(Dataset):
    def __init__(self, texts, scores, word2vec_model, max_length=100):
        self.texts = texts
        self.scores = scores
        self.word2vec_model = word2vec_model
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenized_text = preprocess_text(self.texts[idx]).split()
        embeddings = [self.word2vec_model.wv[word] for word in tokenized_text if word in self.word2vec_model.wv]

        # Pad embeddings if necessary
        padded_embeddings = self.pad_sequence(embeddings)

        return torch.tensor(padded_embeddings), torch.tensor(self.scores[idx])

    def pad_sequence(self, embeddings):
        if len(embeddings) >= self.max_length:
            return embeddings[:self.max_length]
        else:
            num_padding = self.max_length - len(embeddings)
            padded_embeddings = embeddings + [torch.zeros_like(embeddings[0]) for _ in range(num_padding)]
            return padded_embeddings

# Train Word2Vec model
tokenized_texts = [preprocess_text(text).split() for text in df['Text']]
word2vec_model = Word2Vec(tokenized_texts, vector_size=100, window=5, min_count=1, sg=0)

# Save the trained Word2Vec model for later use
word2vec_model.save("word2vec_model.bin")

# Create dataset and dataloader
dataset = TextDataset(df['Text'], df['Score'], word2vec_model)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define LSTM model
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# Initialize model, loss function, and optimizer
input_size = word2vec_model.vector_size
hidden_size = 128
output_size = 1  # Regression task
model = LSTM(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with MSE score calculation
num_epochs = 50
for epoch in range(num_epochs):
    total_loss = 0.0
    total_samples = 0

    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(1).float())  # Reshape labels for regression
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * inputs.size(0)  # Multiply by batch size to account for varying batch sizes
        total_samples += inputs.size(0)

    mse_score = total_loss / total_samples  # Calculate MSE score
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {mse_score:.4f}')

Epoch [1/50], Loss: 6.0673

Epoch [2/50], Loss: 5.9652

Epoch [3/50], Loss: 5.8617

Epoch [4/50], Loss: 5.7548

Epoch [5/50], Loss: 5.6420

Epoch [6/50], Loss: 5.5203

Epoch [7/50], Loss: 5.3858

Epoch [8/50], Loss: 5.2331

Epoch [9/50], Loss: 5.0542

Epoch [10/50], Loss: 4.8358

Epoch [11/50], Loss: 4.5561

Epoch [12/50], Loss: 4.1740

Epoch [13/50], Loss: 3.5987

Epoch [14/50], Loss: 2.6003

Epoch [15/50], Loss: 1.3368

Epoch [16/50], Loss: 1.3215

Epoch [17/50], Loss: 1.6670

Epoch [18/50], Loss: 1.7798

Epoch [19/50], Loss: 1.7166

Epoch [20/50], Loss: 1.5736

Epoch [21/50], Loss: 1.4199

Epoch [22/50], Loss: 1.2960

Epoch [23/50], Loss: 1.2188

Epoch [24/50], Loss: 1.1872

Epoch [25/50], Loss: 1.1900

Epoch [26/50], Loss: 1.2124

Epoch [27/50], Loss: 1.2413

Epoch [28/50], Loss: 1.2671

Epoch [29/50], Loss: 1.2844

Epoch [30/50], Loss: 1.2914

Epoch [31/50], Loss: 1.2886

Epoch [32/50], Loss: 1.2781

Epoch [33/50], Loss: 1.2623

Epoch [34/50], Loss: 1.2441

Epoch [35/50], Loss: 1.

## Part 2 Transformer (Text generation) :

In [5]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
import pandas as pd

import logging
logging.getLogger().setLevel(logging.CRITICAL)

import warnings
warnings.filterwarnings('ignore')

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
def choose_from_top(probs, n=5):
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob) # Normalize
    choice = np.random.choice(n, 1, p = top_prob)
    token_id = ind[choice][0]
    return int(token_id)

In [31]:
# Load and Preprocess the Dataset
df = pd.read_csv("/kaggle/input/articles-csv/Articles.csv", encoding="ISO-8859-1")

In [7]:
df =df.head(500)

In [32]:
df.head(4)

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business


In [35]:
df.drop(['Date', 'Heading', 'NewsType'], axis='columns', inplace=True)

In [36]:
df.head(4)

Unnamed: 0,Article
0,KARACHI: The Sindh government has decided to b...
1,HONG KONG: Asian markets started 2015 on an up...
2,HONG KONG: Hong Kong shares opened 0.66 perce...
3,HONG KONG: Asian markets tumbled Tuesday follo...


In [8]:
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
import os
import json
import csv

class ArticleDataset(Dataset):
    def __init__(self, articles_dataset_path = "/kaggle/input/out-csv/"):
        super().__init__()

        articles_path = os.path.join(articles_dataset_path, 'out.csv')

        self.article_list = []
        self.end_of_text_token = "<|endoftext|>"

        with open(articles_path) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')

            x = 0
            for row in csv_reader:
                article_str = f"ARTICLE:{row[1]}{self.end_of_text_token}"
                self.article_list.append(article_str)

    def __len__(self):
        return len(self.article_list)

    def __getitem__(self, item):
        return self.article_list[item]

In [9]:
dataset = ArticleDataset()
article_loader = DataLoader(dataset, batch_size=1, shuffle=True)

In [10]:
BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 3e-5
WARMUP_STEPS = 5000
MAX_SEQ_LEN = 1024
from transformers import AdamW, get_linear_schedule_with_warmup

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

Training :

In [12]:
model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=-1)
proc_seq_count = 0
sum_loss = 0.0
batch_count = 0

tmp_jokes_tens = None
models_folder = "trained_models"
if not os.path.exists(models_folder):
    os.mkdir(models_folder)

for epoch in range(EPOCHS):

    print(f"EPOCH {epoch} started" + '=' * 30)

    for idx,joke in enumerate(article_loader):

        joke_tens = torch.tensor(tokenizer.encode(joke[0])).unsqueeze(0).to(device)
        #Skip sample from dataset if it is longer than MAX_SEQ_LEN
        if joke_tens.size()[1] > MAX_SEQ_LEN:
            continue

        if not torch.is_tensor(tmp_jokes_tens):
            tmp_jokes_tens = joke_tens
            continue
        else:
            if tmp_jokes_tens.size()[1] + joke_tens.size()[1] > MAX_SEQ_LEN:
                work_jokes_tens = tmp_jokes_tens
                tmp_jokes_tens = joke_tens
            else:
                tmp_jokes_tens = torch.cat([tmp_jokes_tens, joke_tens[:,1:]], dim=1)
                continue

        outputs = model(work_jokes_tens, labels=work_jokes_tens)
        loss, logits = outputs[:2]
        loss.backward()
        sum_loss = sum_loss + loss.detach().data

        proc_seq_count = proc_seq_count + 1
        if proc_seq_count == BATCH_SIZE:
            proc_seq_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            model.zero_grad()

        if batch_count == 100:
            print(f"sum loss {sum_loss}")
            batch_count = 0
            sum_loss = 0.0

    # Store the model after each epoch to compare the performance of them
    torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_medium_joker_{epoch}.pt"))



Generate text :

In [13]:
MODEL_EPOCH = 4

models_folder = "trained_models"

model_path = os.path.join(models_folder, f"gpt2_medium_joker_{MODEL_EPOCH}.pt")
model.load_state_dict(torch.load(model_path))

jokes_output_file_path = f'generated_{MODEL_EPOCH}.articles'

model.eval()
if os.path.exists(jokes_output_file_path):
    os.remove(jokes_output_file_path)
    
joke_num = 0
with torch.no_grad():
   
        for joke_idx in range(100):
        
            joke_finished = False

            cur_ids = torch.tensor(tokenizer.encode("Article:")).unsqueeze(0).to(device)

            for i in range(100):
                outputs = model(cur_ids, labels=cur_ids)
                loss, logits = outputs[:2]
                softmax_logits = torch.softmax(logits[0,-1], dim=0) #Take the first(from only one in this case) batch and the last predicted embedding
                if i < 3:
                    n = 20
                else:
                    n = 3
                next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n) #Randomly(from the topN probability distribution) select the next word
                cur_ids = torch.cat([cur_ids, torch.ones((1,1)).long().to(device) * next_token_id], dim = 1) # Add the last word to the running sequence

                if next_token_id in tokenizer.encode('<|endoftext|>'):
                    joke_finished = True
                    break

            
            if joke_finished:
                
                joke_num = joke_num + 1
                
                output_list = list(cur_ids.squeeze().to('cpu').numpy())
                output_text = tokenizer.decode(output_list)

                with open(jokes_output_file_path, 'a') as f:
                    f.write(f"{output_text} \n\n")

2024-05-26 15:39:35.407830: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-26 15:39:35.407932: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-26 15:39:35.536118: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Display Generated text :

In [14]:
# Define the file path
file_path = "/kaggle/working/generated_4.articles"

# Open the file in read mode
with open(file_path, 'r') as file:
    # Read the content of the file
    file_content = file.read()

# Print the content of the file
print(file_content)

Article:

https://www.facebook.com/pages/MAD-MEMBERSHIP-FRIENDLY-HOMECOMES/16554548981489<|endoftext|> 

Article: A Tale of Two Cities - Part One

A Tale of Two Cities - Part Two

A Tale of Two Cities - Part Three

A Tale of Two Cities - Part Four

A Tale of Two Cities - Part Five

A Tale of Two Cities - Part Six

A Tale of Two Cities - Part Seven<|endoftext|> 

Article: This article may not be reproduced in any form without permission of The Guardian.<|endoftext|> 

Article: The new "S.O.S." song

The song is a parody of the song "S.O.S.," which is about a woman who has a crush on a guy.

The song is a spoof of the popular video game "S.O.S. 2."

The song is a parody of the song "S.O.S. 2," which is about a woman who has a crush on a guy.<|endoftext|> 

Article: In the future, you'll be able to buy your way into the top 10 of the world's richest men

The world's richest men are getting richer and richer, but the world's richest women are still struggling for recognition and respect.<|

## Part 3 BERT :

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import torch

2024-05-26 17:17:34.832880: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-26 17:17:34.833011: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-26 17:17:34.950300: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
import pandas as pd

dataset_path = "/kaggle/input/amazon/AMAZON_FASHION_5.json"
data = pd.read_json(dataset_path, lines=True)

data.head(5)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Orange'}",Tonya B.,Great product and price!,Five Stars,1441324800,,
1,5,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Black (3746...",Tonya B.,Great product and price!,Five Stars,1441324800,,
2,5,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Gray L...",Tonya B.,Great product and price!,Five Stars,1441324800,,
3,5,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue (37867...",Tonya B.,Great product and price!,Five Stars,1441324800,,
4,5,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Pink'}",Tonya B.,Great product and price!,Five Stars,1441324800,,


Preprocessing and selecting columns to work with :

In [4]:
data = data[['reviewText', 'overall']].dropna()

In [5]:
# Clean text data: remove special characters and convert to lowercase
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower()
    return text

In [6]:
data['reviewText'] = data['reviewText'].apply(clean_text)

In [7]:
data['reviewText'] = data['reviewText'].astype(str)
data['overall'] = data['overall'] - 1

In [8]:
train_texts, test_texts, train_labels, test_labels = train_test_split(data['reviewText'].tolist(), data['overall'].tolist(), test_size=0.2, random_state=42 )

Load tokenizer, model... :

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
class DatasetAm(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = DatasetAm(train_encodings, train_labels)
test_dataset = DatasetAm(test_encodings, test_labels)

In [11]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [13]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
    }

In [14]:
trainer = Trainer(
    model=model,                        
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Training :

In [15]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss
10,1.6021
20,1.4753
30,1.3936
40,1.305
50,1.2656
60,1.2179
70,1.1138
80,1.1345
90,1.0099
100,1.0118


TrainOutput(global_step=474, training_loss=0.5172992299629163, metrics={'train_runtime': 277.5428, 'train_samples_per_second': 27.326, 'train_steps_per_second': 1.708, 'total_flos': 1095179933354688.0, 'train_loss': 0.5172992299629163, 'epoch': 3.0})

Evaluation :

In [16]:
eval_result = trainer.evaluate()

In [17]:
accuracy = eval_result['eval_accuracy']
f1 = eval_result['eval_f1']

In [19]:
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.9794
F1 Score: 0.9792


### Conclusion on Using Pre-Trained BERT Models for Text Classification :
The use of pre-trained BERT (Bidirectional Encoder Representations from Transformers) models for text classification tasks has proven to be highly effective and versatile. Here are several key takeaways from the process and results of implementing a BERT model for classifying Amazon Fashion reviews:

1. High Performance :
    * Accuracy and F1 Score : The model achieved impressive results with an accuracy of 97.94% and an F1 score of 97.92%. These metrics indicate that the model is highly accurate and performs well in predicting the correct labels for new, unseen data. The high F1 score also suggests that the model maintains a good balance between precision and recall, handling both false positives and false negatives effectively.


2. Advanced Natural Language Understanding :

    * Contextual Embeddings : BERT's ability to generate contextual embeddings for words means that it can understand the context in which words appear. This is particularly useful in reviews where sentiment can be context-dependent. For instance, BERT can distinguish between the use of words like "great" in "great quality" versus "great disappointment".
    
    
3. Transfer Learning Efficiency :

    * Pre-trained Models : Leveraging a pre-trained BERT model significantly reduces the time and computational resources required compared to training a model from scratch. Pre-trained models have already been exposed to vast amounts of data and can capture complex linguistic patterns, which can then be fine-tuned on specific tasks with relatively smaller datasets.
    
    
4. Fine-Tuning Capability :

    * Adaptability to Specific Tasks : The fine-tuning process allows the pre-trained BERT model to adapt to specific tasks with high efficiency. By adjusting the pre-trained weights during fine-tuning, the model becomes specialized in the given classification task, improving its performance on domain-specific data.
    
    
5. Complexity and Computational Cost :

    * Resource Intensive : Despite its high performance, BERT models are computationally expensive and require significant resources for both training and inference. This includes the need for powerful GPUs and substantial memory, which might not be feasible for all applications, particularly those with limited resources.
    
    
6. Text Preprocessing and Tokenization :

    *  Robust Tokenization : The BERT tokenizer effectively handles various aspects of text preprocessing, such as truncating and padding sentences to a uniform length. This standardization is crucial for ensuring that the model can process inputs efficiently and consistently.

In conclusion, pre-trained BERT models offer a powerful tool for text classification tasks, delivering high accuracy and robustness through advanced natural language understanding capabilities. While they require substantial computational resources, the benefits of high performance and adaptability to specific tasks make them a valuable asset in the field of natural language processing.








# Summary of Learning from Lab 4: NLP Language Models using Pytorch

### Objective :
The primary goal of Lab 4 was to become acquainted with Natural Language Processing (NLP) models using the Pytorch library. This involved tasks ranging from text data collection to model training and evaluation.

## Part 1 : Classification Regression


1. Data Collection :
    * Utilized web scraping libraries (Scrapy/BeautifulSoup) to gather Arabic text data from various websites centered on a specific topic.
    * Prepared a dataset where each text had an associated relevance score (0-10).
    
    
2. Data Preprocessing :
    * Established an NLP preprocessing pipeline involving tokenization, stemming, lemmatization, stop-word removal, and text discretization.
    * Implemented text cleaning functions to remove HTML tags, non-Arabic characters, and unnecessary spaces.


3. Model Training :
    * Trained several models including RNN, Bidirectional RNN, GRU, and LSTM architectures.
    * Tuned hyperparameters to enhance model performance.
    * Developed a custom dataset class for handling text and score data, leveraging Word2Vec embeddings for text representation.
    
    
4. Model Evaluation :
    * Evaluated models using standard metrics like accuracy, loss, and additional metrics like BLEU score.
    * Implemented a training loop to compute Mean Squared Error (MSE) loss and track model performance over epochs.


## Part 2 : Transformer (Text Generation)


1. Model Setup:
    * Installed Pytorch-transformers and loaded the pre-trained GPT2 model.
    
    
2. Fine-Tuning:
    * Fine-tuned the GPT2 model on a customized dataset.
    
    
3. Text Generation:
    * Used the fine-tuned model to generate new paragraphs based on given sentences, enhancing understanding of transformer-based text generation.


## Part 3 : BERT


1. Model Establishment:
    * Used the pre-trained bert-base-uncased model.
    * Adapted the BERT embedding layer to fit the dataset.
    
    
2. Data Preparation and Fine-Tuning:
    * Prepared data and fine-tuned the BERT model by selecting optimal hyperparameters.
    
    
3. Model Evaluation:
    * Evaluated the fine-tuned BERT model using standard metrics (accuracy, loss, F1 score) and additional metrics such as BLEU score and BERT-specific metrics.
    
    
4. Conclusion:

    * Provided a general conclusion on the efficacy of using pre-trained BERT models for NLP tasks.


## Key Learnings :

   * Data Preprocessing : Gained hands-on experience with Arabic text data preprocessing, including tokenization, stop-word removal, stemming, and lemmatization.
   * Model Training : Learned to implement and train various RNN-based models and transformers like GPT2 and BERT.
   * Fine-Tuning : Understood the process of fine-tuning pre-trained models on custom datasets.
   * Evaluation Metrics : Became proficient in evaluating models using both standard and advanced NLP metrics.