**IMPORT DATASET**:


In [13]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/DATASET/cyberbully_data.csv')
print(df.head())
print(df.info())

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24

**DATA PREPROCESSING**:



In [14]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')

print("NLTK stopwords and wordnet data downloaded and libraries imported.")

NLTK stopwords and wordnet data downloaded and libraries imported.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**CONVERT TEXT TO LOWER CASE**:



In [15]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower() # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Remove special characters and numbers
    words = text.split() # Tokenize
    words = [word for word in words if word not in stop_words] # Remove stopwords
    words = [lemmatizer.lemmatize(word) for word in words] # Lemmatize
    return ' '.join(words)

df['cleaned_tweet'] = df['tweet'].apply(preprocess_text)

print(df[['tweet', 'cleaned_tweet']].head())
print("Text preprocessing complete and 'cleaned_tweet' column added.")

                                               tweet  \
0  !!! RT @mayasolovely: As a woman you shouldn't...   
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...   
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...   
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...   
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...   

                                       cleaned_tweet  
0  rt mayasolovely woman shouldnt complain cleani...  
1  rt mleew boy dat coldtyga dwn bad cuffin dat h...  
2  rt urkindofbrand dawg rt sbabylife ever fuck b...  
3           rt cganderson vivabased look like tranny  
4  rt shenikaroberts shit hear might true might f...  
Text preprocessing complete and 'cleaned_tweet' column added.


**TARGET LABELS FOR MODEL TRAINING**:



In [16]:
print(df['class'].value_counts())
print(df['class'].dtype)

print("Target labels in 'class' column are already numerical and ready for model training.")

class
1    19190
2     4163
0     1430
Name: count, dtype: int64
int64
Target labels in 'class' column are already numerical and ready for model training.


## Feature Engineering




**TRAINING AND TESTING**:



In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000) # Limiting features to 5000 to manage dimensionality

# Fit and transform the 'cleaned_tweet' column
X = tfidf_vectorizer.fit_transform(df['cleaned_tweet'])

# Define target variable 'y'
y = df['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("TF-IDF vectorization complete. Data split into training and testing sets.")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

TF-IDF vectorization complete. Data split into training and testing sets.
Shape of X_train: (19826, 5000)
Shape of X_test: (4957, 5000)
Shape of y_train: (19826,)
Shape of y_test: (4957,)


## Train and Evaluate Model




**Using Logistic Regression **:



In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Instantiate Logistic Regression model
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model
log_reg_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_reg_model.predict(X_test)

# Calculate and print evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Print detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Logistic Regression model trained and evaluated.")

Accuracy: 0.8959
Precision: 0.8824
Recall: 0.8959
F1-score: 0.8829

Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.19      0.28       290
           1       0.91      0.96      0.94      3832
           2       0.84      0.83      0.84       835

    accuracy                           0.90      4957
   macro avg       0.78      0.66      0.69      4957
weighted avg       0.88      0.90      0.88      4957

Logistic Regression model trained and evaluated.


In [19]:
import pickle

# Save the trained Logistic Regression model
with open('logistic_regression_model.pkl', 'wb') as file:
    pickle.dump(log_reg_model, file)
print("Logistic Regression model saved as 'logistic_regression_model.pkl'")

# Save the fitted TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)
print("TF-IDF vectorizer saved as 'tfidf_vectorizer.pkl'")

Logistic Regression model saved as 'logistic_regression_model.pkl'
TF-IDF vectorizer saved as 'tfidf_vectorizer.pkl'
