In [1]:
# Question 1
import pandas as pd

In [2]:
file_path = 'urdu-sentiment-corpus-v1.tsv'
df = pd.read_csv(file_path, delimiter='\t')
print(df)

                                                 Tweet Class
0    میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب ...     P
1    چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...     N
2                             ٹویٹر کا خیال کیسے آیا ؟     O
3    سرچ انجن گوگل کے نائب صدر نے فضا میں ، 130,000...     P
4      ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار :أْ     P
..                                                 ...   ...
995     اُس آدمی نے اِس سالار کو کافی معقول ٹپ دی ہے ۔     P
996  چچا غالب کی روح سے معذرت کے ساتھہم نے مانا کہ ...     P
997  واہ جناب واہ! اچھی رہی۔ جناب خود کو فرشتہ سمجو...     P
998  اسلام آباد :پی اے ٹی کا دھرنا ختم، صفائی کے کا...     P
999  دنیا نے کس کا راہ وفا میں دیا ہے ساتھتم بھی چل...     P

[1000 rows x 2 columns]


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN, GRU, LSTM, Bidirectional, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


# Split data into features and labels
X = df['Tweet']
y = df['Class']

# Binarization
y_binary = np.where(y == 'P', 1, 0)

# Checking if there are mixed or unknown targets in the labels
if np.any((y_binary != 0) & (y_binary != 1)):
    raise ValueError("Labels contain mixed or unknown targets.")

# Checking for missing values in the labels
missing_labels = np.isnan(y_binary)
if np.any(missing_labels):
    raise ValueError("Missing values found in labels.")

y_binary = np.expand_dims(y_binary, axis=-1)

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
max_sequence_length = 100
X = pad_sequences(X, maxlen=max_sequence_length)

# train and test split
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.25, random_state=42)


def create_model(model_type, num_layers, dropout_rate):
    model = Sequential()

    model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=max_sequence_length))


    # Adding layers based on the model
    if model_type == 'RNN':
        for _ in range(num_layers):
            model.add(SimpleRNN(64, return_sequences=True))
    elif model_type == 'GRU':
        for _ in range(num_layers):
            model.add(GRU(64, return_sequences=True))
    elif model_type == 'LSTM':
        for _ in range(num_layers):
            model.add(LSTM(64, return_sequences=True))
    elif model_type == 'BiLSTM':
        for _ in range(num_layers):
            model.add(Bidirectional(LSTM(64, return_sequences=True)))

    # Dropout layer for dropout rate
    model.add(Dropout(dropout_rate))

    # Output layer
    model.add(Dense(1, activation='sigmoid'))

    return model


results = []

# Hyperparameters
num_layers_options = [2, 3]
dropout_rate_options = [0.3, 0.7]


for model_type in ['RNN', 'GRU', 'LSTM', 'BiLSTM']:
    for num_layers in num_layers_options:
        for dropout_rate in dropout_rate_options:
            # Creating model
            model = create_model(model_type, num_layers, dropout_rate)
            model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

            # Training model
            model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=0)

            # Model Evaluation
            y_pred_prob = model.predict(X_test)
            y_pred_mean = np.mean(y_pred_prob, axis=1)
            y_pred = (y_pred_mean > 0.5).astype(int)

            # Checking shapes of y_test and y_pred
            if y_test.shape != y_pred.shape:
                raise ValueError("Shapes of y_test and y_pred do not match.")
            # Checking for missing values in y_test
            if np.isnan(y_test).any():
                raise ValueError("Missing values found in y_test.")

            # Checking shapes of y_test and y_pred
            if y_test.shape != y_pred.shape:
                raise ValueError("Shapes of y_test and y_pred do not match.")

            # Calculating evaluation metrics
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            results.append({
                'Model': model_type,
                'Num Layers': num_layers,
                'Dropout Rate': dropout_rate,
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1-score': f1
            })


results_df = pd.DataFrame(results)
print(results_df)









  _warn_prf(average, modifier, msg_start, len(result))


     Model  Num Layers  Dropout Rate  Accuracy  Precision    Recall  F1-score
0      RNN           2           0.3     0.520   0.818182  0.070866  0.130435
1      RNN           2           0.7     0.516   0.875000  0.055118  0.103704
2      RNN           3           0.3     0.508   0.508065  0.992126  0.672000
3      RNN           3           0.7     0.496   1.000000  0.007874  0.015625
4      GRU           2           0.3     0.500   1.000000  0.015748  0.031008
5      GRU           2           0.7     0.520   0.705882  0.094488  0.166667
6      GRU           3           0.3     0.628   0.628788  0.653543  0.640927
7      GRU           3           0.7     0.492   0.000000  0.000000  0.000000
8     LSTM           2           0.3     0.532   0.613636  0.212598  0.315789
9     LSTM           2           0.7     0.632   0.610063  0.763780  0.678322
10    LSTM           3           0.3     0.564   0.650000  0.307087  0.417112
11    LSTM           3           0.7     0.568   0.702128  0.259

In [None]:
# Question 2
# Using RNN with 3 layers and 0.3 dropout rate as it has highest f1-score

In [4]:
#Word2vec
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from gensim.models import KeyedVectors
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN, Dropout
from keras.preprocessing.sequence import pad_sequences

# Load the pre-trained Word2Vec embeddings
word2vec_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, limit=20000)

# Load the dataset
file_path = 'urdu-sentiment-corpus-v1.tsv'
df = pd.read_csv(file_path, delimiter='\t')

# Split data into features and labels
X = df['Tweet']
y = df['Class']

# Convert labels to binary format (0 for Negative, 1 for Positive)
y_binary = np.where(y == 'P', 1, 0)

# Tokenizing the text data and convert to sequences
max_sequence_length = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to make them of equal length
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)

X_train, X_test, y_train, y_test = train_test_split(X_padded, y_binary, test_size=0.25, random_state=42)

# Creating an embedding matrix using Word2Vec embeddings
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 300))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

# Defining the RNN model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=300, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
model.add(SimpleRNN(64, return_sequences=False))  # Return sequences only for the last layer
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)

# Predict probabilities for the test set
y_pred_prob = model.predict(X_test)

# Threshold for classification
threshold = 0.5
y_pred = (y_pred_prob > threshold).astype(int)

# Calculate precision, recall, and F1-score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print("Evaluation metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

results = []
for threshold in np.arange(0.1, 0.6, 0.1):
    y_pred = (y_pred_prob > threshold).astype(int)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append({'Threshold': threshold, 'Precision': precision, 'Recall': recall, 'F1-score': f1})


results_df = pd.DataFrame(results)

# Print results
print(results_df)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Evaluation metrics:
Accuracy: 0.492
Precision: 0.5
Recall: 0.007874015748031496
F1-score: 0.015503875968992248
   Threshold  Precision    Recall  F1-score
0        0.1   0.510040  1.000000  0.675532
1        0.2   0.512295  0.984252  0.673854
2        0.3   0.512295  0.984252  0.673854
3        0.4   0.512295  0.984252  0.673854
4        0.5   0.500000  0.007874  0.015504


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN, Dropout
from keras.preprocessing.sequence import pad_sequences

# Load the GloVe embeddings
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_embeddings_path = 'glove.6B.300d.txt'  # Adjust the path to your GloVe embeddings file
glove_embeddings = load_glove_embeddings(glove_embeddings_path)

# Load the dataset
file_path = 'urdu-sentiment-corpus-v1.tsv'
df = pd.read_csv(file_path, delimiter='\t')

# Split data into features and labels
X = df['Tweet']
y = df['Class']

# Convert labels to binary format (0 for Negative, 1 for Positive)
y_binary = np.where(y == 'P', 1, 0)

# Tokenizing the text data and convert to sequences
max_sequence_length = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to make them of equal length
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)

X_train, X_test, y_train, y_test = train_test_split(X_padded, y_binary, test_size=0.25, random_state=42)

# Creating an embedding matrix using GloVe embeddings
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 300))
for word, i in tokenizer.word_index.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Defining the RNN model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=300, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
model.add(SimpleRNN(64, return_sequences=False))  # Return sequences only for the last layer
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compiling the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training the model
model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)

# Predicting probabilities for the test set
y_pred_prob = model.predict(X_test)

# Threshold for classification
threshold = 0.5
y_pred = (y_pred_prob > threshold).astype(int)

# Calculate precision, recall, and F1-score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print("Evaluation metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

results = []
for threshold in np.arange(0.1, 0.6, 0.1):
    y_pred = (y_pred_prob > threshold).astype(int)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append({'Threshold': threshold, 'Precision': precision, 'Recall': recall, 'F1-score': f1})

results_df = pd.DataFrame(results)

# Print results
print(results_df)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Evaluation metrics:
Accuracy: 0.492
Precision: 0.5
Recall: 0.09448818897637795
F1-score: 0.15894039735099336
   Threshold  Precision    Recall  F1-score
0        0.1   0.508000  1.000000  0.673740
1        0.2   0.510288  0.976378  0.670270
2        0.3   0.514768  0.960630  0.670330
3        0.4   0.506667  0.897638  0.647727
4        0.5   0.500000  0.094488  0.158940


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import KeyedVectors
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN, Dropout
from keras.preprocessing.sequence import pad_sequences

# Load the FastText embeddings
fasttext_model = KeyedVectors.load_word2vec_format('wiki-news-300d-1M.vec', binary=False, limit=20000)  # Adjust the path

# Load the dataset
file_path = 'urdu-sentiment-corpus-v1.tsv'
df = pd.read_csv(file_path, delimiter='\t')

# Split data into features and labels
X = df['Tweet']
y = df['Class']

# Convert labels to binary format (0 for Negative, 1 for Positive)
y_binary = np.where(y == 'P', 1, 0)

# Tokenizing the text data and convert to sequences
max_sequence_length = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to make them of equal length
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)

X_train, X_test, y_train, y_test = train_test_split(X_padded, y_binary, test_size=0.25, random_state=42)

# Creating an embedding matrix using FastText embeddings
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 300))
for word, i in tokenizer.word_index.items():
    if word in fasttext_model:
        embedding_matrix[i] = fasttext_model[word]

# Defining the RNN model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=300, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
model.add(SimpleRNN(64, return_sequences=False))  # Return sequences only for the last layer
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compiling the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)

# Predicting probabilities for the test set
y_pred_prob = model.predict(X_test)

# Threshold for classification
threshold = 0.5
y_pred = (y_pred_prob > threshold).astype(int)

# Calculate precision, recall, and F1-score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print("Evaluation metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

results = []
for threshold in np.arange(0.1, 0.6, 0.1):
    y_pred = (y_pred_prob > threshold).astype(int)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append({'Threshold': threshold, 'Precision': precision, 'Recall': recall, 'F1-score': f1})


results_df = pd.DataFrame(results)

# Print results
print(results_df)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Evaluation metrics:
Accuracy: 0.52
Precision: 0.6842105263157895
Recall: 0.10236220472440945
F1-score: 0.1780821917808219
   Threshold  Precision    Recall  F1-score
0        0.1   0.508000  1.000000  0.673740
1        0.2   0.510040  1.000000  0.675532
2        0.3   0.516667  0.976378  0.675749
3        0.4   0.517391  0.937008  0.666667
4        0.5   0.684211  0.102362  0.178082


In [12]:
!pip install tensorflow_hub

Collecting tensorflow_hub
  Using cached tensorflow_hub-0.16.1-py2.py3-none-any.whl (30 kB)
Collecting tf-keras>=2.14.1
  Using cached tf_keras-2.16.0-py3-none-any.whl (1.7 MB)
Collecting tensorflow<2.17,>=2.16
  Using cached tensorflow-2.16.1-cp310-cp310-win_amd64.whl (2.1 kB)
Installing collected packages: tensorflow, tf-keras, tensorflow_hub
  Attempting uninstall: tensorflow
    Found existing installation: tensorflow 2.15.0
    Uninstalling tensorflow-2.15.0:
      Successfully uninstalled tensorflow-2.15.0
Successfully installed tensorflow-2.16.1 tensorflow_hub-0.16.1 tf-keras-2.16.0




In [21]:
!pip install --upgrade --force-reinstall tensorflow


Collecting tensorflow
  Using cached tensorflow-2.16.1-cp310-cp310-win_amd64.whl (2.1 kB)
Collecting tensorflow-intel==2.16.1
  Using cached tensorflow_intel-2.16.1-cp310-cp310-win_amd64.whl (376.9 MB)


ERROR: Could not install packages due to an OSError: [Errno 28] No space left on device



In [23]:
!pip install --upgrade tensorflow keras






In [16]:
!pip install allennlp



Collecting allennlp
  Using cached allennlp-2.10.1-py3-none-any.whl (730 kB)
Collecting transformers<4.21,>=4.1
  Using cached transformers-4.20.1-py3-none-any.whl (4.4 MB)
Collecting typer>=0.4.1
  Using cached typer-0.9.0-py3-none-any.whl (45 kB)
Collecting base58>=2.1.1
  Using cached base58-2.1.1-py3-none-any.whl (5.6 kB)
Collecting filelock<3.8,>=3.3
  Using cached filelock-3.7.1-py3-none-any.whl (10 kB)
Collecting sacremoses
  Using cached sacremoses-0.1.1-py3-none-any.whl (897 kB)
Collecting more-itertools>=8.12.0
  Using cached more_itertools-10.2.0-py3-none-any.whl (57 kB)
Collecting tensorboardX>=1.2
  Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
Collecting protobuf<4.0.0,>=3.12.0
  Using cached protobuf-3.20.3-cp310-cp310-win_amd64.whl (904 kB)
Collecting fairscale==0.4.6
  Using cached fairscale-0.4.6-py3-none-any.whl
Collecting torchvision<0.14.0,>=0.8.1
  Using cached torchvision-0.13.1-cp310-cp310-win_amd64.whl (1.1 MB)
Collecting cached-path<1.2.0,>=1



In [27]:
!pip install --upgrade tensorflow-hub





In [30]:
!pip install tensorflow-text

Collecting tensorflow-text
  Using cached tensorflow_text-2.10.0-cp310-cp310-win_amd64.whl (5.0 MB)
Collecting tensorflow<2.11,>=2.10.0
  Downloading tensorflow-2.10.1-cp310-cp310-win_amd64.whl (455.9 MB)
     ------------------------------------ 455.9/455.9 MB 273.8 kB/s eta 0:00:00


ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'c:\\users\\anasb\\anaconda3\\lib\\site-packages\\protobuf-3.20.3.dist-info\\METADATA'



Collecting tensorflow-text
  Using cached tensorflow_text-2.10.0-cp310-cp310-win_amd64.whl (5.0 MB)
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.10.0


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.16.1 requires keras>=3.0.0, but you have keras 2.10.0 which is incompatible.
tensorflow-intel 2.16.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.
tensorflow-intel 2.16.1 requires tensorboard<2.17,>=2.16, but you have tensorboard 2.10.1 which is incompatible.
tensorboardx 2.6.2.2 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.


Collecting tensorflow-text
  Downloading tensorflow_text-2.10.0-cp310-cp310-win_amd64.whl (5.0 MB)
     ---------------------------------------- 5.0/5.0 MB 1.7 MB/s eta 0:00:00
Collecting tensorflow<2.11,>=2.10.0
  Downloading tensorflow-2.10.1-cp310-cp310-win_amd64.whl (455.9 MB)
     ------------------------------------- 455.9/455.9 MB 37.7 kB/s eta 0:00:00
Collecting protobuf>=3.19.6
  Downloading protobuf-3.19.6-cp310-cp310-win_amd64.whl (895 kB)
     ------------------------------------ 895.7/895.7 kB 244.3 kB/s eta 0:00:00
Collecting gast<=0.4.0,>=0.2.1
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting tensorboard<2.11,>=2.10
  Downloading tensorboard-2.10.1-py3-none-any.whl (5.9 MB)
     ---------------------------------------- 5.9/5.9 MB 393.7 kB/s eta 0:00:00
Collecting keras-preprocessing>=1.1.1
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
     --------------------------------------- 42.6/42.6 kB 76.6 kB/s eta 0:00:00
Collecting keras<2.11,

In [34]:
!pip install --upgrade tensorflow tensorflow-text

Collecting tensorflow
  Using cached tensorflow-2.16.1-cp310-cp310-win_amd64.whl (2.1 kB)
Collecting tensorboard<2.17,>=2.16
  Using cached tensorboard-2.16.2-py3-none-any.whl (5.5 MB)
Collecting keras>=3.0.0
  Using cached keras-3.0.5-py3-none-any.whl (1.0 MB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3
  Using cached protobuf-4.25.3-cp310-abi3-win_amd64.whl (413 kB)
Collecting tensorflow
  Using cached tensorflow-2.10.1-cp310-cp310-win_amd64.whl (455.9 MB)


ERROR: Could not install packages due to an OSError: [Errno 28] No space left on device



In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_text as text
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN, Dropout
#from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset
file_path = 'urdu-sentiment-corpus-v1.tsv'
df = pd.read_csv(file_path, delimiter='\t')

# Split data into features and labels
X = df['Tweet']
y = df['Class']

# Convert labels to binary format (0 for Negative, 1 for Positive)
y_binary = np.where(y == 'P', 1, 0)

# Tokenizing the text data and convert to sequences
max_sequence_length = 100
tokenizer = text.keras_tokenizer_from_dataset(X)
X_sequences = tokenizer(X)

# Pad sequences to make them of equal length
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)

X_train, X_test, y_train, y_test = train_test_split(X_padded, y_binary, test_size=0.25, random_state=42)

# Defining the RNN model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.get_vocabulary()), output_dim=300, input_length=max_sequence_length))
model.add(SimpleRNN(64, return_sequences=True))  # Return sequences for all layers
model.add(Dropout(0.3))
model.add(SimpleRNN(64, return_sequences=True))  # Return sequences for all layers
model.add(Dropout(0.3))
model.add(SimpleRNN(64, return_sequences=False))  # Return sequences only for the last layer
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compiling the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training the model
model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)

# Predicting probabilities for the test set
y_pred_prob = model.predict(X_test)

# Threshold for classification
threshold = 0.5
y_pred = (y_pred_prob > threshold).astype(int)

# Calculate precision, recall, and F1-score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print("Evaluation metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


AttributeError: module 'tensorflow_text' has no attribute 'keras_tokenizer_from_dataset'