# Phase 1: Text Preprocessing

### 1. Stopword Removal

In [6]:
import pandas as pd

# Load the dataset
file_path = 'urdu_sarcastic_dataset.csv'
df = pd.read_csv(file_path)

# Display the dataset
df.head()

Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,🤣😂😂 ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہ...,1.0,,,,,,
1,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...,1.0,,,,,,
2,کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...,0.0,,,,,,
3,نہیں پائین 😎,0.0,,,,,,
4,`` مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی...,1.0,,,,,,


In [7]:
# Load the stopwords from the text file
with open('stopwords-ur.txt', 'r', encoding='utf-8') as file:
    urdu_stopwords = file.read().splitlines()

# Check the first few stopwords
print(urdu_stopwords[:10])

['آئی', 'آئے', 'آج', 'آخر', 'آخرکبر', 'آدهی', 'آًب', 'آٹھ', 'آیب', 'اة']


In [8]:
# Function to remove stopwords, handling non-string values
def remove_stopwords(text):
    if isinstance(text, str):  # Check if the text is a string
        words = text.split()  # Split text into words
        cleaned_text = ' '.join([word for word in words if word not in urdu_stopwords])
        return cleaned_text
    else:
        return text  # Return the text unchanged if it's not a string

# Apply the stopword removal to the 'urdu_text' column
df['cleaned_text'] = df['urdu_text'].apply(remove_stopwords)

# Display the cleaned text
df[['urdu_text', 'cleaned_text']].head()

Unnamed: 0,urdu_text,cleaned_text
0,🤣😂😂 ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہ...,🤣😂😂 لینے میری شادی فسادن کوجی نہیں چاہیے 😐😐😐🤣
1,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...
2,کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...,کامران خان آپکی دن بھریہ زمہ داری لگائی اپوزیش...
3,نہیں پائین 😎,نہیں پائین 😎
4,`` مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی...,`` مراد علی شاہ بھیس میں ڈی جی ایس '' حامد میر😁


### 2. Punctuation, Emojis, and Hashtags:

In [9]:
import re
import string

# Function to clean the text
def clean_text(text):
    if isinstance(text, str):
        # 1. Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        
        # 2. Remove Hashtags
        text = re.sub(r'#\w+', '', text)
        
        # 3. Remove Punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))

        # 4. Remove all emojis
        text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F900-\U0001F9FF\U0001FA00-\U0001FAFF]', '', text)

        return text.strip()
    else:
        return text  # Return unchanged if not a string

# Apply the clean_text function to the 'urdu_text' column
df['cleaned_text'] = df['urdu_text'].apply(clean_text)

# Display the cleaned text
df[['urdu_text', 'cleaned_text']].head()


Unnamed: 0,urdu_text,cleaned_text
0,🤣😂😂 ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہ...,ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہیں چ...
1,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...
2,کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...,کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...
3,نہیں پائین 😎,نہیں پائین
4,`` مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی...,مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی تھے...


### 3. Short Conversations:

In [10]:
# Function to filter short conversations
def filter_short_conversations(text):
    if isinstance(text, str):
        # Count the number of words
        word_count = len(text.split())
        # Return the text if it has 3 or more words, otherwise return an empty string
        return text if word_count >= 3 else ''
    return text  # Return unchanged if not a string

# Apply the filter_short_conversations function to the 'cleaned_text' column
df['cleaned_text'] = df['cleaned_text'].apply(filter_short_conversations)

# Save the cleaned text to a separate CSV file
df[['cleaned_text']].to_csv('cleaned_urdu_text.csv', index=False, encoding='utf-8')
# Display the results
df[['urdu_text', 'cleaned_text']].head()

Unnamed: 0,urdu_text,cleaned_text
0,🤣😂😂 ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہ...,ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہیں چ...
1,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...
2,کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...,کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...
3,نہیں پائین 😎,
4,`` مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی...,مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی تھے...


### 4. Some additional preprocessing techniques (normalization)

In [11]:
# Given text str, replace one or more spacings with a single space, 
# and one or more linebreaks with a single newline. 

from urduhack.preprocessing import normalize_whitespace

# Apply normalization to the 'cleaned_text' column
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: normalize_whitespace(x) if pd.notnull(x) else x)


# Save the updated data to a new CSV file
df[['cleaned_text']].to_csv('normalized_dataset.csv', index=False, encoding='utf-8')

# Display the normalized text
df[['cleaned_text']].head()

2024-09-29 22:12:52.449409: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-29 22:12:52.453762: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-29 22:12:52.547268: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-29 22:12:52.548838: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a min

Unnamed: 0,cleaned_text
0,ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہیں چ...
1,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...
2,کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...
3,
4,مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی تھے...


In [19]:
# Removing some of the additional urdu punctuations from the text

from urduhack.preprocessing import remove_punctuation

# Ensure all entries in 'cleaned_text' are strings. Replace non-string entries with an empty string.
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: str(x) if isinstance(x, str) else '')

# Apply remove_punctuation function
df['cleaned_text'] = df['cleaned_text'].apply(remove_punctuation)


# Save the updated data to a new CSV file
df[['cleaned_text']].to_csv('normalized_dataset.csv', index=False, encoding='utf-8')

# Display the first 20 rows of 'cleaned_text'
print(df['cleaned_text'].head())

0    ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہیں چ...
1    چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...
2    کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...
3                                                     
4    مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی تھے...
Name: cleaned_text, dtype: object


In [12]:
# Remove accents from any accented unicode characters in text str, either by transforming them 
# into ascii equivalents or removing them entirely.

from urduhack.preprocessing import remove_accents

# Ensure all entries in 'cleaned_text' are strings. Replace non-string entries with an empty string.
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: str(x) if isinstance(x, str) else '')

# Apply remove_punctuation function
df['cleaned_text'] = df['cleaned_text'].apply(remove_accents)


# Save the updated data to a new CSV file
df[['cleaned_text']].to_csv('normalized_dataset.csv', index=False, encoding='utf-8')

# Display the first 20 rows of 'cleaned_text'
print(df['cleaned_text'].head())

0    ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہیں چ...
1    چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...
2    کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...
3                                                     
4    مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی تھے...
Name: cleaned_text, dtype: object


In [13]:
# This function replaces English digits with Urdu digits.
from LughaatNLP import LughaatNLP

# Initialize the LughaatNLP object
urdu_text_processing = LughaatNLP()

# Apply the replace_digits function to each row in the 'cleaned_text' column
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: urdu_text_processing.replace_digits(x) if isinstance(x, str) else x)

# Save the updated data to a new CSV file
df[['cleaned_text']].to_csv('normalized_dataset.csv', index=False, encoding='utf-8')

# Display the result 'cleaned_text'
print(df['cleaned_text'].head())



0    ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہیں چ...
1    چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...
2    کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...
3                                                     
4    مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی تھے...
Name: cleaned_text, dtype: object


In [14]:
# This function removes all non-Urdu characters, numbers, and special characters, just leaving only pure 
# Urdu text even not special character used in urdu.

# Apply the just_urdu function to each row in the 'cleaned_text' column
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: urdu_text_processing.just_urdu(x) if isinstance(x, str) else x)

# Save the updated data to a new CSV file
df[['cleaned_text']].to_csv('normalized_dataset.csv', index=False, encoding='utf-8')

# Display the result 'cleaned_text'
print(df['cleaned_text'].head())

0    ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہیں چ...
1    چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...
2    کامران خان اپکی دن بھریہ زمہ داری لگائی گئی اپ...
3                                                     
4    مراد علی شاہ کے بھیس میں ڈی جی ائی ایس ائی تھے...
Name: cleaned_text, dtype: object


In [15]:
# # spell checker

# # Initialize the LughaatNLP object
# spell_checker = LughaatNLP()

# # Apply the spell_check function to each row in the 'cleaned_text' column
# df['cleaned_text'] = df['cleaned_text'].apply(lambda x: spell_checker.corrected_sentence_spelling(x, 60) if isinstance(x, str) else x)

# # Save the updated data to a new CSV file
# df[['cleaned_text']].to_csv('normalized_dataset.csv', index=False, encoding='utf-8')

# # Display the result 'cleaned_text'
# print(df['cleaned_text'].head())

# Phase 2: Stemming and Lemmatization

### 1. Stemming:

In [16]:
# Import LughaatNLP for stemming and lemmatization

# Function to apply stemming only
def apply_stemming(text):
    if isinstance(text, str):
        # Apply stemming
        stemmed_text = urdu_text_processing.urdu_stemmer(text)
        return stemmed_text
    else:
        return text

# Apply stemming to the 'cleaned_text' column
df['stemmed_text'] = df['cleaned_text'].apply(apply_stemming)

# Save the stemmed output to a new CSV file
df[['stemmed_text']].to_csv('stemmed_dataset.csv', index=False, encoding='utf-8')

# Display the first 20 rows of 'stemmed_text'
print(df['stemmed_text'].head())

0    ہو لینہ دہ میری شادی فسادن ٹھیک ہہ کوجی نہا چاہیہ
1    چل مہمانا ما کھانا سرو کر چڑیل چاچی نا دسدی اں ما
2    کامران خان اپکی دن بھریہ زمہ داری لگائی گئی اپ...
3                                                     
4    مراد علی شاہ کہ بھیس ما ڈی جی ائی ایس ائی تھہ ...
Name: stemmed_text, dtype: object


### 2. Lemmatization:

In [17]:
# Function to apply lemmatization only
def apply_lemmatization(text):
    if isinstance(text, str):
        # Apply lemmatization
        lemmatized_text = urdu_text_processing.lemmatize_sentence(text)
        return lemmatized_text
    else:
        return text

# Apply lemmatization to the 'cleaned_text' column
df['lemmatized_text'] = df['cleaned_text'].apply(apply_lemmatization)

# Save the lemmatized output to a new CSV file
df[['lemmatized_text']].to_csv('lemmatized_dataset.csv', index=False, encoding='utf-8')

# Display the first 20 rows of 'lemmatized_text'
print(df['lemmatized_text'].head())

0    ہونا لینا دینا میرا شادی فسادن ٹھیک ہونا کوجی ...
1    چلنا مہمان میں کھا سرا کرنا چڑیل چاچی نوں دسدی...
2    کامران خان اپکی دن بھریہ زمہ داری لگنا جانا اپ...
3                                                     
4    مراد علی شاہ کم بھیس میں ڈی جینا ائی ایس ائی ت...
Name: lemmatized_text, dtype: object


# Phase 3: Feature Extraction

### 1. Tokenization:

In [18]:
# This function tokenizes the Urdu text into individual tokens

# Apply the urdu_tokenize function to each row in the 'lemmatized_text' column
df['tokenized_text'] = df['lemmatized_text'].apply(
    lambda x: urdu_text_processing.urdu_tokenize(x) if isinstance(x, str) else x
)

# Save the tokenized data to a new CSV file
df[['tokenized_text']].to_csv('tokenized_dataset.csv', index=False, encoding='utf-8')

# Display the first 5 rows of the tokenized text
print(df['tokenized_text'].head())

0    [ہونا, لینا, دینا, میرا, شادی, فسادن, ٹھیک, ہو...
1    [چلنا, مہمان, میں, کھا, سرا, کرنا, چڑیل, چاچی,...
2    [کامران, خان, اپکی, دن, بھریہ, زمہ, داری, لگنا...
3                                                   []
4    [مراد, علی, شاہ, کم, بھیس, میں, ڈی, جینا, ائی,...
Name: tokenized_text, dtype: object


### 2. Tf-IDF (Term Frequency-Inverse Document Frequency):

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Join the tokenized text into a single string for each document
df['joined_text'] = df['tokenized_text'].apply(lambda tokens: ' '.join(tokens))

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the joined text to compute TF-IDF scores
tfidf_matrix = vectorizer.fit_transform(df['joined_text'])

# Get the words corresponding to the features
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Sum the TF-IDF scores for each word across all documents
tfidf_sum = tfidf_df.sum().sort_values(ascending=False)

# Get the top 10 words with the highest TF-IDF scores
top_tfidf_words = tfidf_sum.head(10)

# Display the top 10 words and their TF-IDF scores
print("Top 10 words with highest TF-IDF scores:")
print(top_tfidf_words)

# Save the top TF-IDF words to a CSV file
top_tfidf_words.to_csv('top_tfidf_words.csv', header=True)

Top 10 words with highest TF-IDF scores:
ہونا    1379.796968
میں     1146.799896
کم       962.396668
کرنا     619.291894
کو       526.431342
سے       516.883089
نہیں     498.592380
کا       477.211109
بھی      446.767171
اور      441.691920
dtype: float64


### 3. Word2Vec:

In [30]:
from gensim.models import Word2Vec

# Load the fixed tokenized dataset
tokenized_sentences = df['tokenized_text'].tolist()

# Train a Word2Vec model
# Parameters: 
#   size=100: Vector size (embedding dimension)
#   window=5: Context window size
#   min_count=1: Ignores words with frequency lower than this
word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Save the trained Word2Vec model for future use
word2vec_model.save("urdu_word2vec_fixed.model")

# finding similar words for "اچھا"
try:
    similar_words = word2vec_model.wv.most_similar("اچھا", topn=5)
    print("Top 5 words most similar to 'اچھا':", similar_words)

    # Save the similar words to a CSV file
    similar_words_df = pd.DataFrame(similar_words, columns=['Word', 'Similarity'])
    similar_words_df.to_csv('similar_words_acha.csv', index=False, encoding='utf-8')

except KeyError:
    print("Word 'اچھا' not found in vocabulary.")

Top 5 words most similar to 'اچھا': [('مشکل', 0.9648818969726562), ('افسوس', 0.9622622132301331), ('اچھی', 0.960913896560669), ('ہمیں', 0.959526002407074), ('مجھے', 0.9595121145248413)]


# Phase 4: N-grams Analysis

### 1. Unigram, Bigram, and Trigram Analysis:

In [33]:
import nltk
from nltk import FreqDist
from nltk.util import ngrams

# Load the tokenized dataset
df_tokenized = pd.read_csv('tokenized_dataset.csv')

# Extract the tokenized text column
tokenized_sentences = df_tokenized['tokenized_text'].apply(eval).tolist()  # Converting string representation of list to list

# Flatten the list of tokenized sentences for unigram analysis
all_tokens = [token for sentence in tokenized_sentences for token in sentence]

# Generate unigrams, bigrams, and trigrams
unigrams = all_tokens
bigrams = list(ngrams(all_tokens, 2))
trigrams = list(ngrams(all_tokens, 3))

# Calculate the frequency distribution of bigrams and trigrams
bigram_freq = FreqDist(bigrams)
trigram_freq = FreqDist(trigrams)

# Get the top 10 most common bigrams and trigrams
top_10_bigrams = bigram_freq.most_common(10)
top_10_trigrams = trigram_freq.most_common(10)

# Display the top 10 bigrams and trigrams
print("Top 10 Bigrams:")
for bigram, freq in top_10_bigrams:
    print(f"{bigram}: {freq}")

print("\nTop 10 Trigrams:")
for trigram, freq in top_10_trigrams:
    print(f"{trigram}: {freq}")

# Save the bigrams and trigrams to CSV files
bigrams_df = pd.DataFrame(top_10_bigrams, columns=['Bigram', 'Frequency'])
bigrams_df.to_csv('top_10_bigrams.csv', index=False, encoding='utf-8')

trigrams_df = pd.DataFrame(top_10_trigrams, columns=['Trigram', 'Frequency'])
trigrams_df.to_csv('top_10_trigrams.csv', index=False, encoding='utf-8')

Top 10 Bigrams:
('ہونا', 'میں'): 1863
('رہنا', 'ہونا'): 1607
('ہونا', 'ہونا'): 1015
('ہونا', 'کہنا'): 992
('ہونا', 'ت'): 807
('ہونا', 'اور'): 775
('کرنا', 'ہونا'): 745
('میں', 'نے'): 693
('نہیں', 'ہونا'): 618
('جا', 'ہونا'): 602

Top 10 Trigrams:
('کرنا', 'رہنا', 'ہونا'): 337
('ہونا', 'کہنا', 'میں'): 190
('رہنا', 'ہونا', 'میں'): 163
('ہونا', 'جانا', 'ہونا'): 141
('ہونا', 'ت', 'میں'): 126
('ہونا', 'میں', 'نے'): 124
('نواز', 'شریف', 'کم'): 122
('ائی', 'جینا', 'سندھ'): 118
('پینا', 'ٹی', 'ائی'): 115
('ہونا', 'جا', 'ہونا'): 112


# Phase 5: Sentiment Classification Model

### 1. Model Building:

In [39]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec

# Load the Word2Vec model
word2vec_model = Word2Vec.load("urdu_word2vec_fixed.model")

def sentence_to_vector(tokens):
    # Use tokens directly (assumed to be a list)
    vectors = []
    
    for word in tokens:
        if word in word2vec_model.wv:
            vectors.append(word2vec_model.wv[word])
    
    if vectors:  # Return the average vector if there are valid vectors
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(word2vec_model.vector_size)  # Return a zero vector if no valid words

# Create a new column for sentence vectors directly from the tokenized_text
df['sentence_vector'] = df['tokenized_text'].apply(sentence_to_vector)

# Display the first few sentence vectors
print(df['sentence_vector'].head())

0    [-0.17241704, 0.3244016, 0.20045978, 0.4228485...
1    [-0.22701998, 0.3297415, -0.0068548904, 0.1310...
2    [-0.15025176, 0.2084173, -0.031012284, 0.14914...
3    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4    [-0.41087872, 0.62309057, 0.036108177, -0.1755...
Name: sentence_vector, dtype: object


In [44]:
# APplying logistic regression
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Load the Word2Vec model
word2vec_model = Word2Vec.load("urdu_word2vec_fixed.model")

def sentence_to_vector(tokens):
    vectors = []
    
    for word in tokens:
        if word in word2vec_model.wv:
            vectors.append(word2vec_model.wv[word])
    
    if vectors:  # Return the average vector if there are valid vectors
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(word2vec_model.vector_size)  # Return a zero vector if no valid words

# Create a new column for sentence vectors directly from the tokenized_text
df['sentence_vector'] = df['tokenized_text'].apply(sentence_to_vector)

# Check for NaN values in the target column
if df['is_sarcastic'].isnull().any():
    print("Found NaN values in the is_sarcastic column. Dropping those rows.")
    df = df.dropna(subset=['is_sarcastic'])

# Prepare the features and labels
X = np.array(df['sentence_vector'].tolist())
y = df['is_sarcastic'].values  # Use the correct column for labels

# Check for NaN values in the features
if np.isnan(X).any() or np.isnan(y).any():
    print("Found NaN values in the features or target. Removing rows with NaN values.")
    valid_indices = ~np.isnan(X).any(axis=1) & ~np.isnan(y)
    X = X[valid_indices]
    y = y[valid_indices]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Found NaN values in the is_sarcastic column. Dropping those rows.
Accuracy: 0.7080729817545613
              precision    recall  f1-score   support

         0.0       0.72      0.72      0.72      2073
         1.0       0.70      0.70      0.70      1928

    accuracy                           0.71      4001
   macro avg       0.71      0.71      0.71      4001
weighted avg       0.71      0.71      0.71      4001



# Phase 6: Evaluation & Optimization

### 1. Evaluation:

Model Performance Metrics
After running your logistic regression model on the Urdu posts, you achieved the following evaluation metrics:

**Accuracy:** 70.81%

**Precision:**
- Class 0 (non-sarcastic): 72%
- Class 1 (sarcastic): 70%

**Recall:**
- Class 0 (non-sarcastic): 72%
- Class 1 (sarcastic): 70%

**F1-Score:**
- Class 0 (non-sarcastic): 72%
- Class 1 (sarcastic): 70%

### Analysis of Performance

**Strong Performance Areas:**
- **Basic Sentences:** The model performs relatively well on straightforward, structured sentences that do not contain idiomatic expressions or complex syntax.
- **Common Sarcasm Indicators:** The model is likely able to recognize certain phrases or words that are commonly associated with sarcasm, leading to better precision and recall in those instances.

**Struggling Areas:**
- **Complex Sentences:** The model may struggle with more complex sentence structures or those containing multiple clauses, leading to a decrease in recall for sarcastic posts.
- **Colloquial Language and Slang:** Informal language, slang, or regional dialects might not be well-represented in the training data, leading to misclassifications.
- **Contextual Understanding:** Sarcasm often relies heavily on context, tone, and cultural nuances, which may not be fully captured by the model.
- **Negations and Inversion:** Sentences where negations are present can create confusion for the model, affecting its ability to classify sentiment accurately.

### Areas for Improvement
- **Data Augmentation:** Enhance the dataset with more examples of complex sentences and sarcastic expressions.
- **Feature Engineering:** Explore features that capture contextual meaning, such as part-of-speech tagging or dependency parsing.
- **Advanced Models:** Consider using advanced machine learning models or deep learning approaches like LSTMs or transformers that can handle context better.
- **Fine-Tuning:** Experiment with hyperparameters and explore the use of pre-trained language models for Urdu, if available.


### 2. Challenges in Urdu Sentiment Analysis:

Key Challenges
### Complex Morphology:

- **Word Formation:** Urdu has a rich morphological structure with words often formed through the combination of root words, prefixes, and suffixes. This complexity makes tokenization and normalization challenging.
- **Inflection:** The inflectional nature of Urdu can lead to multiple forms of the same word, complicating feature extraction.

### Colloquial Language:

- **Slang and Dialects:** Urdu spoken in different regions may vary significantly, incorporating local slang and idiomatic expressions that are not present in formal written Urdu.
- **Code-Switching:** Many Urdu speakers mix English and Urdu in their posts, which can further complicate sentiment analysis if the model is not trained on such data.

### Noisy Data from Social Media:

- **Informal Text:** Posts on social media often contain abbreviations, typos, and non-standard grammar, which can hinder the model's ability to parse and understand the text accurately.
- **Emotionally Charged Language:** Users may express sentiments in varied ways, including sarcasm, irony, or hyperbole, making it challenging for models to accurately classify emotions.

### Lack of Resources:

- **Limited Datasets:** High-quality labeled datasets for Urdu sentiment analysis are scarce, which can limit the model's training effectiveness.
- **Tools and Libraries:** There may be fewer NLP libraries and tools available for Urdu compared to languages like English, limiting preprocessing and analysis capabilities.

# 🎉 The End 🎉<div style="text-align: center;">