<a href="https://colab.research.google.com/github/apoorva-ppl/De-nile-Dreamers/blob/main/DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
print(os.path.exists("/content/MSRParaphraseCorpus.msi"))


True


In [None]:
from datasets import load_dataset

dataset = load_dataset("glue", "mrpc")
dataset.save_to_disk("mrpc_dataset")  # Saves in a usable format


Saving the dataset (0/1 shards):   0%|          | 0/3668 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/408 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1725 [00:00<?, ? examples/s]

In [None]:
import os

dir_path = "/content/mrpc_dataset/train"
print(os.listdir(dir_path))  # This will show all files in the folder


['dataset_info.json', 'data-00000-of-00001.arrow', 'state.json']


In [None]:
from datasets import load_from_disk

# Load dataset from saved directory
dataset = load_from_disk("mrpc_dataset")

# Convert to Pandas DataFrame
df = dataset["train"].to_pandas()

# Save as TSV file
df.to_csv("mrpc_dataset.tsv", sep="\t", index=False)

print("Dataset saved successfully as 'mrpc_dataset.tsv'!")
print(df.head())  # Display first few rows


Dataset saved successfully as 'mrpc_dataset.tsv'!
                                           sentence1  \
0  Amrozi accused his brother , whom he called " ...   
1  Yucaipa owned Dominick 's before selling the c...   
2  They had published an advertisement on the Int...   
3  Around 0335 GMT , Tab shares were up 19 cents ...   
4  The stock rose $ 2.11 , or about 11 percent , ...   

                                           sentence2  label  idx  
0  Referring to him as only " the witness " , Amr...      1    0  
1  Yucaipa bought Dominick 's in 1995 for $ 693 m...      0    1  
2  On June 10 , the ship 's owners had published ...      1    2  
3  Tab shares jumped 20 cents , or 4.6 % , to set...      0    3  
4  PG & E Corp. shares jumped $ 1.63 or 8 percent...      1    4  


In [29]:
import pandas as pd
import re
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset
file_path = "/content/mrpc_dataset.tsv"  # Update with the correct path
df = pd.read_csv(file_path, delimiter='\t', quoting=3)

# Drop missing values
df.dropna(inplace=True)
print(df.columns)
print(df.head())

# Text Cleaning
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df['sentence1'] = df['sentence1'].apply(clean_text)
df['sentence2'] = df['sentence2'].apply(clean_text)

# Prepare inputs and labels
x = df[['sentence1', 'sentence2']]
y = df['label']  # Label: 1 = Paraphrase, 0 = Not a Paraphrase

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


Index(['sentence1', 'sentence2', 'label', 'idx'], dtype='object')
                                           sentence1  \
0  "Amrozi accused his brother , whom he called "...   
1  Yucaipa owned Dominick 's before selling the c...   
2  They had published an advertisement on the Int...   
3  Around 0335 GMT , Tab shares were up 19 cents ...   
4  The stock rose $ 2.11 , or about 11 percent , ...   

                                           sentence2  label  idx  
0  "Referring to him as only "" the witness "" , ...      1    0  
1  Yucaipa bought Dominick 's in 1995 for $ 693 m...      0    1  
2  On June 10 , the ship 's owners had published ...      1    2  
3  Tab shares jumped 20 cents , or 4.6 % , to set...      0    3  
4  PG & E Corp. shares jumped $ 1.63 or 8 percent...      1    4  


In [30]:
MAX_VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 50

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(pd.concat([X_train['sentence1'], X_train['sentence2']]))

def text_to_sequence(sentences):
    return pad_sequences(tokenizer.texts_to_sequences(sentences), maxlen=MAX_SEQUENCE_LENGTH)

X_train_seq1 = text_to_sequence(X_train['sentence1'])
X_train_seq2 = text_to_sequence(X_train['sentence2'])
X_test_seq1 = text_to_sequence(X_test['sentence1'])
X_test_seq2 = text_to_sequence(X_test['sentence2'])

# Stack sentence pairs for training
X_train_combined = np.hstack((X_train_seq1, X_train_seq2))
X_test_combined = np.hstack((X_test_seq1, X_test_seq2))


In [32]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

# Model Definition
model = Sequential([
    Embedding(MAX_VOCAB_SIZE, 128, input_length=MAX_SEQUENCE_LENGTH * 2),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    LSTM(32),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Output layer (binary classification)
])

# Compile Model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_combined, y_train, validation_data=(X_test_combined, y_test), epochs=5, batch_size=32)


Epoch 1/5
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 188ms/step - accuracy: 0.6440 - loss: 0.6531 - val_accuracy: 0.6717 - val_loss: 0.6068
Epoch 2/5
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 201ms/step - accuracy: 0.7619 - loss: 0.4889 - val_accuracy: 0.6158 - val_loss: 0.6716
Epoch 3/5
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 188ms/step - accuracy: 0.9087 - loss: 0.2812 - val_accuracy: 0.6362 - val_loss: 0.8308
Epoch 4/5
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 181ms/step - accuracy: 0.9540 - loss: 0.1466 - val_accuracy: 0.6540 - val_loss: 1.1711
Epoch 5/5
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 201ms/step - accuracy: 0.9841 - loss: 0.0683 - val_accuracy: 0.6308 - val_loss: 1.2805


<keras.src.callbacks.history.History at 0x7a91abfa3ed0>