In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd


df = pd.read_csv('labeled_data.csv')
# Assume 'df' is your DataFrame with the dataset
X = df[['tweet', 'count', 'offensive_language', 'hate_speech' , 'neither']]
y = df['class']

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

df['tweet'] = df['tweet'].apply(lambda x: x.lower())  # Convert to lowercase
df['tweet'] = df['tweet'].replace('[^a-zA-Z0-9]', ' ', regex=True)  # Remove special characters

# Convert text to numerical features using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['tweet'])
X_val_tfidf = tfidf_vectorizer.transform(X_val['tweet'])

# Concatenate TF-IDF features with other numerical features
X_train_final = X_train[['count', 'offensive_language', 'hate_speech','neither']].values
X_train_final = hstack([X_train_tfidf, X_train_final])
X_val_final = X_val[['count', 'offensive_language', 'hate_speech','neither']].values
X_val_final = hstack([X_val_tfidf, X_val_final])

# Train a model (e.g., RandomForestClassifier)
clf = RandomForestClassifier()
clf.fit(X_train_final, y_train)

# Make predictions
predictions = clf.predict(X_val_final)

# Evaluate the model
accuracy = accuracy_score(y_val, predictions)
print(f"Accuracy: {accuracy}")
print(classification_report(y_val, predictions))


Accuracy: 0.998587855557797
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       290
           1       1.00      1.00      1.00      3832
           2       1.00      1.00      1.00       835

    accuracy                           1.00      4957
   macro avg       1.00      0.99      1.00      4957
weighted avg       1.00      1.00      1.00      4957

