# Data Loading 

In [1]:
import pandas as pd

data = pd.read_csv("/home/catpc/Downloads/Sarcasm.csv")
df = data[['tweet','sarcastic']]
df


Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not ‚Äúforced‚Äù to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1
...,...,...
3463,The population spike in Chicago in 9 months is...,0
3464,You'd think in the second to last English clas...,0
3465,I‚Äôm finally surfacing after a holiday to Scotl...,0
3466,Couldn't be prouder today. Well done to every ...,0


In [2]:
df.isna().sum() # missing (NaN) values ‡¶•‡¶æ‡¶ï‡¶æ row ‡¶ó‡ßÅ‡¶≤‡ßã ‡¶°‡¶ø‡¶≤‡¶ø‡¶ü ‡¶ï‡¶∞‡ßá ‡¶¶‡ßá‡ßü‡•§

tweet        1
sarcastic    0
dtype: int64

In [3]:
df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [4]:
df.isna().sum()

tweet        0
sarcastic    0
dtype: int64

In [5]:
df.isna().any()

tweet        False
sarcastic    False
dtype: bool

In [6]:
# Check if entire DataFrame has any NaN
df.isna().any().any()

False

In [7]:
#Row-wise Nan Check
df.isna().any(axis=1)

0       False
1       False
2       False
3       False
4       False
        ...  
3463    False
3464    False
3465    False
3466    False
3467    False
Length: 3467, dtype: bool

# Data Cleaning 


In [8]:
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from contractions import fix

# Download required resources
nltk.download('punkt')       # Tokenizer
nltk.download('stopwords')   # Stopwords
nltk.download('wordnet')     # Lemmatizer dictionary

# Default stopwords
stop_words = set(stopwords.words('english'))
#add stop words 
stop_words.add('us')

# Custom slang/unwanted words
custom_slags = {'u', 'ur', 'r', 'lol', 'omg', 'ummm'}

# Combine both
all_stopwords = stop_words.union(custom_slags)

# Lemmatizer init
lemmatizer = WordNetLemmatizer()

def clean_text_advanced(text):
    text = fix(text)  # Expand contractions
    text = text.lower()  # Lowercase
    text = re.sub(r'\S+@\S+', '', text)  # Remove email addresses
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)    # Remove HTML tags
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove emojis / non-ASCII
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation

    tokens = word_tokenize(text)  # Tokenization

    # Remove stopwords & apply lemmatization
    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in all_stopwords and word.isalpha()]
    
    return " ".join(clean_tokens)  # Return cleaned string

# Example text
example_text = """
Wow!!! I can't believe THIS is happening... üòÇüòÇ  
Ummm... well, it's not like I didn't see it coming, right???  
Anyway, let's just move on. #sarcasm #irony  
Visit: https://example.com or email me at test@mail.com.  
"""

# Run cleaning
print(clean_text_advanced(example_text))


[nltk_data] Downloading package punkt to /home/catpc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/catpc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/catpc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


wow believe happening well like see coming right anyway let move sarcasm irony visit email


In [9]:
df.head()

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not ‚Äúforced‚Äù to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1


# clean_text_advanced is apply  in tweet column 

In [10]:
df.loc[:, 'tweet'] = df['tweet'].apply(clean_text_advanced)


In [11]:
df.head()

Unnamed: 0,tweet,sarcastic
0,thing got college caffeine addiction,1
1,love professor draw big question mark next ans...,1
2,remember hundred email company covid started g...,1
3,today poppop told forced go college okay sure ...,1
4,volphancarol littlewhitty mysticalmanatee also...,1


In [12]:
df['sarcastic'].value_counts()

sarcastic
0    2600
1     867
Name: count, dtype: int64

# Data Balancing


In [13]:
from imblearn.over_sampling import RandomOverSampler

#splitting the data into featture and lebels
x = df['tweet'].values.reshape(-1,1) #reshaping for the oversampling
y = df['sarcastic']

#applying Random Oversampling
oversampler = RandomOverSampler(random_state=42)
x_balanced, y_balanced = oversampler.fit_resample(x,y)


#create a balanced dataframe
df = pd.DataFrame({'tweet':x_balanced.flatten(), 'sarcastic':y_balanced})


#check the new class distribution 
df['sarcastic'].value_counts()


sarcastic
1    2600
0    2600
Name: count, dtype: int64

# Vectorization and train test split

In [14]:
#Tf-Idf vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


tfidf = TfidfVectorizer()
X =tfidf.fit_transform(df['tweet'])

y = df['sarcastic']


#  Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False) #with_mean=False due to sparse matrix
x_train = scaler.fit_transform(X_train)
x_test = scaler.transform(X_test)




In [16]:
# 3. Train SVM Model
from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

model = SVC(kernel='linear')  # Linear kernel is common for text classification
model.fit(X_train, y_train)

In [17]:
#predictions
y_pred = model.predict(X_test)

In [18]:
#evalutae the model
accuracy = accuracy_score(y_test,y_pred)
classification_rep = classification_report(y_test,y_pred)
conf_matrix = confusion_matrix(y_test,y_pred)


# Display results
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 84.13%
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.83      0.83       493
           1       0.85      0.85      0.85       547

    accuracy                           0.84      1040
   macro avg       0.84      0.84      0.84      1040
weighted avg       0.84      0.84      0.84      1040

Confusion Matrix:
 [[408  85]
 [ 80 467]]


In [19]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print("Classification Report (Random Forest):\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix (Random Forest):\n", confusion_matrix(y_test, y_pred_rf))

Random Forest Accuracy: 91.06%
Classification Report (Random Forest):
               precision    recall  f1-score   support

           0       0.89      0.93      0.91       493
           1       0.93      0.90      0.91       547

    accuracy                           0.91      1040
   macro avg       0.91      0.91      0.91      1040
weighted avg       0.91      0.91      0.91      1040

Confusion Matrix (Random Forest):
 [[457  36]
 [ 57 490]]


In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Naive Bayes': MultinomialNB()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    
    print(f"Model: {name}")
    print(accuracy)
    print(confusion_matrix(y_test, preds))
    print(classification_report(y_test, preds))
    print("-" * 30)



Model: Logistic Regression
0.8413461538461539
[[399  94]
 [113 434]]
              precision    recall  f1-score   support

           0       0.78      0.81      0.79       493
           1       0.82      0.79      0.81       547

    accuracy                           0.80      1040
   macro avg       0.80      0.80      0.80      1040
weighted avg       0.80      0.80      0.80      1040

------------------------------
Model: SVM
0.8413461538461539
[[472  21]
 [ 76 471]]
              precision    recall  f1-score   support

           0       0.86      0.96      0.91       493
           1       0.96      0.86      0.91       547

    accuracy                           0.91      1040
   macro avg       0.91      0.91      0.91      1040
weighted avg       0.91      0.91      0.91      1040

------------------------------
Model: Random Forest
0.8413461538461539
[[451  42]
 [ 59 488]]
              precision    recall  f1-score   support

           0       0.88      0.91      0.90 

# sarcasm Detection System Function 

In [22]:
def detect_sarcasm(new_headline):
    cleaned_headline = clean_text_advanced(new_headline)
    transformed_headline = tfidf.transform([cleaned_headline])
    prediction = rf_model.predict(transformed_headline)
    
    return prediction[0]

In [28]:
test_headline = "Oh great, another meeting that could‚Äôve been an email‚Äîjust what I needed to spice up my day."
detect_sarcasm(test_headline)

1

In [29]:
test_headline ="Keep talking, I love the sound of nonsense in the morning."
detect_sarcasm(test_headline)

1

In [31]:
test_headline ="I really appreciate your help."
detect_sarcasm(test_headline)

0