# Data Loading 

In [1]:
import pandas as pd

data = pd.read_csv("/home/catpc/Downloads/Sarcasm.csv")
df = data[['tweet','sarcastic']]
df


Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1
...,...,...
3463,The population spike in Chicago in 9 months is...,0
3464,You'd think in the second to last English clas...,0
3465,I’m finally surfacing after a holiday to Scotl...,0
3466,Couldn't be prouder today. Well done to every ...,0


In [2]:
df.isna().sum() # missing (NaN) values থাকা row গুলো ডিলিট করে দেয়।

tweet        1
sarcastic    0
dtype: int64

In [3]:
df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [4]:
df.isna().sum()

tweet        0
sarcastic    0
dtype: int64

In [5]:
df.isna().any()

tweet        False
sarcastic    False
dtype: bool

In [6]:
# Check if entire DataFrame has any NaN
df.isna().any().any()

False

In [7]:
#Row-wise Nan Check
df.isna().any(axis=1)

0       False
1       False
2       False
3       False
4       False
        ...  
3463    False
3464    False
3465    False
3466    False
3467    False
Length: 3467, dtype: bool

# Data Cleaning 


In [37]:
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from contractions import fix

# Download required resources
nltk.download('punkt')       # Tokenizer
nltk.download('stopwords')   # Stopwords
nltk.download('wordnet')     # Lemmatizer dictionary

# Default stopwords
stop_words = set(stopwords.words('english'))
#add stop words 
stop_words.add('us')

# Custom slang/unwanted words
custom_slags = {'u', 'ur', 'r', 'lol', 'omg', 'ummm'}

# Combine both
all_stopwords = stop_words.union(custom_slags)

# Lemmatizer init
lemmatizer = WordNetLemmatizer()

def clean_text_advanced(text):
    text = fix(text)  # Expand contractions
    text = text.lower()  # Lowercase
    text = re.sub(r'\S+@\S+', '', text)  # Remove email addresses
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)    # Remove HTML tags
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove emojis / non-ASCII
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation

    tokens = word_tokenize(text)  # Tokenization

    # Remove stopwords & apply lemmatization
    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in all_stopwords and word.isalpha()]
    
    return " ".join(clean_tokens)  # Return cleaned string

# Example text
example_text = """
Wow!!! I can't believe THIS is happening... 😂😂  
Ummm... well, it's not like I didn't see it coming, right???  
Anyway, let's just move on. #sarcasm #irony  
Visit: https://example.com or email me at test@mail.com.  
"""

# Run cleaning
print(clean_text_advanced(example_text))


wow believe happening well like see coming right anyway let move sarcasm irony visit email


[nltk_data] Downloading package punkt to /home/catpc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/catpc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/catpc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [38]:
df.head()

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1


# clean_text_advanced is apply  in tweet column 

In [44]:
df.loc[:, 'tweet'] = df['tweet'].apply(clean_text_advanced)


In [45]:
df.head()

Unnamed: 0,tweet,sarcastic
0,thing got college caffeine addiction,1
1,love professor draw big question mark next ans...,1
2,remember hundred email company covid started g...,1
3,today poppop told forced go college okay sure ...,1
4,volphancarol littlewhitty mysticalmanatee also...,1


Unnamed: 0,tweet,sarcastic
0,thing got college caffeine addiction,1
1,love professor draw big question mark next ans...,1
2,remember hundred email company covid started g...,1
3,today poppop told forced go college okay sure ...,1
4,volphancarol littlewhitty mysticalmanatee also...,1
...,...,...
3463,population spike chicago month ridiculous,0
3464,would think second last english class year pro...,0
3465,finally surfacing holiday scotland difficult d...,0
3466,could prouder today well done every student go...,0
