In [126]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [127]:
# Load both CSV splits (paths are relative to the notebook's working directory).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("train_tw.csv", "test_tw.csv")
)

In [128]:
# Peek at the first ten raw rows before any cleaning is applied.
train_df.head(n=10)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...
7,8,0,the next school year is the year for exams.ð...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...


In [129]:
# DataFrame.info() writes its summary directly to stdout and returns None,
# so it must not be wrapped in print() -- doing so emits a stray "None" line
# after each summary.
print("Training Data ")
train_df.info()
print("Test Data ")
test_df.info()

Training Data 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB
None
Test Data 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17197 entries, 0 to 17196
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      17197 non-null  int64 
 1   tweet   17197 non-null  object
dtypes: int64(1), object(1)
memory usage: 268.8+ KB
None


In [130]:
import re

def remove_pattern(tweet, pattern):
    """Return ``tweet`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    tweet : str
        Text to clean.
    pattern : str
        Regular expression whose matches are deleted.

    Returns
    -------
    str
        ``tweet`` with all non-overlapping matches replaced by the empty string.

    Notes
    -----
    The substitution is done with the pattern itself. Re-using each matched
    substring as a new regex (the previous approach) is wrong in three ways:
    a short match such as ``@user`` also strips the prefix of a longer one
    such as ``@user2``; regex metacharacters inside the matched text are
    misinterpreted; and patterns with several capturing groups make
    ``re.findall`` return tuples, which ``re.sub`` cannot accept as a pattern.
    """
    return re.sub(pattern, '', tweet)

In [131]:
# Regex for Twitter handles: an "@" followed by zero or more word characters.
pattern = r"@[\w]*"

# Rebind remove_pattern to an element-wise (vectorised) version so it can be
# called on a whole column. The isinstance guard keeps this cell idempotent:
# re-running it would otherwise wrap the already-vectorised callable again.
if not isinstance(remove_pattern, np.vectorize):
    remove_pattern = np.vectorize(remove_pattern)


In [132]:
# Normalise case in both splits so later matching is case-insensitive.
for frame in (train_df, test_df):
    frame['tweet'] = frame['tweet'].str.lower()


In [133]:
# Strip every match of `pattern` from the tweet text of both splits.
train_df['tweet'], test_df['tweet'] = (
    remove_pattern(frame['tweet'], pattern) for frame in (train_df, test_df)
)


In [134]:

def replace_special_characters(text):
    """Replace every character that is not an ASCII letter with a single space.

    Each non-letter is substituted one-for-one, so runs of digits or
    punctuation become runs of spaces and the string length is unchanged.
    """
    return re.sub(r"[^a-zA-Z]", ' ', text)

# Blank out digits, punctuation and any other non-letter in both splits.
train_df['tweet'] = train_df['tweet'].map(replace_special_characters)
test_df['tweet'] = test_df['tweet'].map(replace_special_characters)

In [135]:
# Same ten rows as before, now after lower-casing, handle removal and
# non-letter replacement.
train_df.head(n=10)

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so sel...
1,2,0,thanks for lyft credit i can t use cause th...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in ...
4,5,0,factsguide society now motivation
5,6,0,huge fan fare and big talking before the...
6,7,0,camping tomorrow danny
7,8,0,the next school year is the year for exams ...
8,9,0,we won love the land allin cavs champ...
9,10,0,welcome here i m it s so gr


In [136]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [140]:
# Tokenise each cleaned tweet into a list of words (one list per tweet).
train_tokens = list(map(word_tokenize, train_df['tweet']))
test_tokens = list(map(word_tokenize, test_df['tweet']))

In [141]:

# `stop_words` was not defined in any earlier cell visible in this notebook,
# so this cell raised NameError on a fresh kernel.
# NOTE(review): the NLTK English stop-word list is assumed here -- confirm it
# matches whatever list was used when the notebook was originally run.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # The corpus is not installed locally yet; fetch it once and retry.
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Tokenise each tweet, then drop stop words from the token lists.
train_df['tweet_token'] = train_df['tweet'].apply(word_tokenize)
train_df['preprocessed_text'] = train_df['tweet_token'].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

test_df['tweet_token'] = test_df['tweet'].apply(word_tokenize)
test_df['preprocessed_text'] = test_df['tweet_token'].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)


In [142]:

from nltk.stem import PorterStemmer

# Porter stemmer instance; also reused by later cells.
stemmer = PorterStemmer()

# Stem every token of every tweet, keeping the list-of-lists layout.
train_stemmed_tokens = [list(map(stemmer.stem, tokens)) for tokens in train_tokens]
test_stemmed_tokens = [list(map(stemmer.stem, tokens)) for tokens in test_tokens]


In [143]:
from nltk.stem import WordNetLemmatizer

# WordNet lemmatizer, called with its default part-of-speech argument.
lemmatizer = WordNetLemmatizer()

# Lemmatize every token of every tweet, keeping the list-of-lists layout.
train_lemmatized_tokens = [list(map(lemmatizer.lemmatize, tokens)) for tokens in train_tokens]
test_lemmatized_tokens = [list(map(lemmatizer.lemmatize, tokens)) for tokens in test_tokens]


In [144]:
# Stack the train and test rows into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it the test rows would reuse index
# labels already taken by the training rows, making label-based lookups
# ambiguous. Test rows have no 'label' column, so their label becomes NaN.
combined_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Tokenise, drop stop words, then stem -- each stage stored in its own column.
combined_df['tweet_token'] = combined_df['tweet'].apply(word_tokenize)
combined_df['tweet_token_filtered'] = combined_df['tweet_token'].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)
combined_df['tweet_stemmed'] = combined_df['tweet_token_filtered'].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)


In [145]:

# One space-separated string of every stemmed token in the combined frame.
all_words = ' '.join(
    token for tokens in combined_df['tweet_stemmed'] for token in tokens
)

# Same idea restricted to rows with label 0 and label 1 respectively.
# NOTE(review): going by the variable names, label 1 presumably marks hateful
# tweets -- confirm against the dataset description.
normal_words = ' '.join(
    combined_df.loc[combined_df['label'] == 0, 'tweet_stemmed'].map(' '.join)
)
hate_words = ' '.join(
    combined_df.loc[combined_df['label'] == 1, 'tweet_stemmed'].map(' '.join)
)



In [146]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize the stop-word-filtered tokens of every row.
combined_df['tweet_lemmatized'] = combined_df['tweet_token_filtered'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)

# Rebuild the word strings from the lemmatized tokens. This overwrites the
# `all_words` and `normal_words` values computed from the stemmed tokens.
all_words = ' '.join(
    token for tokens in combined_df['tweet_lemmatized'] for token in tokens
)
normal_words = ' '.join(
    combined_df.loc[combined_df['label'] == 0, 'tweet_lemmatized'].map(' '.join)
)
# NOTE(review): the stemmed counterpart of this string is called `hate_words`;
# the two names are kept as-is because later cells may rely on either.
negative_words = ' '.join(
    combined_df.loc[combined_df['label'] == 1, 'tweet_lemmatized'].map(' '.join)
)



In [147]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Target labels for the supervised model.
y = train_df['label']

# TF-IDF features built from the cleaned 'tweet' text.
# NOTE(review): the vectorizer is fitted on all training rows before the
# train/validation split, and it reads the 'tweet' column rather than the
# stop-word-filtered / stemmed / lemmatized columns -- confirm both are
# intended.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_df['tweet'])

In [148]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [149]:
from sklearn.svm import SVC

# Linear-kernel SVM trained on the TF-IDF features (fit() returns the estimator).
model = SVC(kernel='linear').fit(X_train, y_train)

# Mean accuracy on the held-out validation rows.
accuracy = model.score(X_val, y_val)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.9582355701548568


In [150]:
# f1_score and classification_report are not imported by any earlier cell
# visible in this notebook, so this cell raised NameError on a fresh kernel.
from sklearn.metrics import classification_report, f1_score

# Predict on the validation rows and report the F1 score plus the per-class
# precision / recall / F1 breakdown.
y_pred = model.predict(X_val)

print("f1 score :", f1_score(y_val, y_pred))
report = classification_report(y_val, y_pred)
print("Classification Report:")
print(report)


f1 score : 0.6147186147186147
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5937
           1       0.90      0.47      0.61       456

    accuracy                           0.96      6393
   macro avg       0.93      0.73      0.80      6393
weighted avg       0.96      0.96      0.95      6393

