<a href="https://colab.research.google.com/github/abhineetbhardwaj/Bharat-intern/blob/main/Untitled12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = '/content/HateSpeechData.csv'
data = pd.read_csv(file_path)

# Preprocess the data: Combine 'hate_speech' and 'offensive_language' into one binary label
data['label'] = data['class'].apply(lambda x: 1 if x == 0 else (2 if x == 1 else 0))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['tweet'], data['label'], test_size=0.2, random_state=42)

# Create a pipeline to vectorize the text data and train a logistic regression model
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(solver='lbfgs', max_iter=1000))
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

# Function to predict and raise a flag if the rating exceeds 0.68
def predict_hate_speech(text):
    prob = pipeline.predict_proba([text])[0]
    rating = max(prob)
    prediction = pipeline.predict([text])[0]
    if rating > 0.68:
        flag = True
    else:
        flag = False
    return {'prediction': prediction, 'rating': rating, 'flag': flag}

# Example usage
example_text = "I will kill you"
result = predict_hate_speech(example_text)
print(result)


Accuracy: 0.891466612870688
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       835
           1       0.52      0.17      0.26       290
           2       0.91      0.96      0.94      3832

    accuracy                           0.89      4957
   macro avg       0.75      0.65      0.67      4957
weighted avg       0.88      0.89      0.88      4957

{'prediction': 1, 'rating': 0.6630989310130573, 'flag': False}


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score
import numpy as np

# Load the dataset
file_path = '/content/HateSpeechData.csv'
data = pd.read_csv(file_path)

# Preprocess the data: Combine 'hate_speech' and 'offensive_language' into one binary label
data['label'] = data['class'].apply(lambda x: 1 if x == 0 else (2 if x == 1 else 0))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['tweet'], data['label'], test_size=0.2, random_state=42)

# Create a pipeline to vectorize the text data and train a logistic regression model
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(solver='lbfgs', max_iter=1000, multi_class='ovr'))
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
y_prob = pipeline.predict_proba(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_prob[:, 1], pos_label=1)
roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
print(f'ROC AUC: {roc_auc}')

# Find the optimal threshold
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(f'Optimal Threshold: {optimal_threshold}')

# Function to predict and raise a flag if the rating exceeds the optimal threshold
def predict_hate_speech(text):
    prob = pipeline.predict_proba([text])[0]
    rating = max(prob)
    prediction = pipeline.predict([text])[0]
    if rating > optimal_threshold:
        flag = True
    else:
        flag = False
    return {'prediction': prediction, 'rating': rating, 'flag': flag}

# Example usage
example_text = "i will kill you"
result = predict_hate_speech(example_text)
print(result)


Accuracy: 0.8838006858987291
              precision    recall  f1-score   support

           0       0.84      0.77      0.80       835
           1       0.51      0.13      0.21       290
           2       0.90      0.97      0.93      3832

    accuracy                           0.88      4957
   macro avg       0.75      0.62      0.65      4957
weighted avg       0.87      0.88      0.87      4957

ROC AUC: 0.9284780163501455
Optimal Threshold: 0.05850829165662622
{'prediction': 1, 'rating': 0.5421227134261576, 'flag': True}


In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_curve

# Load the dataset
file_path = '/content/HateSpeechData.csv'  # Update with your file path
data = pd.read_csv(file_path)

# Preprocess the data
data['label'] = data['class'].apply(lambda x: 1 if x == 0 else (2 if x == 1 else 0))
texts = data['tweet'].values
labels = data['label'].values

# Tokenize and pad the sequences
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Build the model
model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')  # 3 classes: hate speech, offensive language, neither
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

# Make predictions
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# Classification report
report = classification_report(y_test, y_pred_classes, target_names=['neither', 'hate_speech', 'offensive_language'])
print(report)

# Determine the optimal threshold using ROC curve
fpr = {}
tpr = {}
thresholds = {}

for i in range(3):
    fpr[i], tpr[i], thresholds[i] = roc_curve(y_test, y_pred[:, i], pos_label=i)

# Find the optimal threshold for each class
optimal_thresholds = {}
for i in range(3):
    optimal_idx = np.argmax(tpr[i] - fpr[i])
    optimal_thresholds[i] = thresholds[i][optimal_idx]

optimal_threshold = max(optimal_thresholds.values())
print(f'Optimal Threshold: {optimal_threshold}')

# Function to predict and raise a flag if the rating exceeds the optimal threshold
def predict_content(text, threshold=optimal_threshold):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)
    prob = model.predict(padded_sequence)[0]
    rating = max(prob)
    prediction = np.argmax(prob)
    if rating > threshold:
        flag = True
    else:
        flag = False
    return {'prediction': prediction, 'rating': rating, 'flag': flag}

# Example usage
example_text = "This is an example tweet."
result = predict_content(example_text)
print(result)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Loss: 1.5556929111480713
Accuracy: 0.8612063527107239
                    precision    recall  f1-score   support

           neither       0.85      0.71      0.77       835
       hate_speech       0.30      0.27      0.29       290
offensive_language       0.90      0.94      0.92      3832

          accuracy                           0.86      4957
         macro avg       0.68      0.64      0.66      4957
      weighted avg       0.86      0.86      0.86      4957

Optimal Threshold: 0.9999967813491821
{'prediction': 1, 'rating': 0.9991097, 'flag': False}


In [None]:
example_text = "I like you"
result = predict_content(example_text)
print(result)

{'prediction': 0, 'rating': 0.37846184, 'flag': False}


In [None]:
example_text = "I dont like you"
result = predict_content(example_text)
print(result)

{'prediction': 0, 'rating': 0.6263757, 'flag': False}


In [None]:
example_text = "I will kill you"
result = predict_content(example_text)
print(result)

{'prediction': 1, 'rating': 0.5329239, 'flag': False}


In [None]:
example_text = "I will fuck you"
result = predict_content(example_text)
print(result)

NameError: name 'predict_content' is not defined

In [None]:
example_text = "I will fuck you up"
result = predict_content(example_text)
print(result)

{'prediction': 2, 'rating': 0.93917227, 'flag': True}


In [None]:
example_text = "you are so fat"
result = predict_content(example_text)
print(result)

{'prediction': 1, 'rating': 0.5573462, 'flag': False}


In [None]:
example_text = "just go and jump off a cliff you fat fucker"
result = predict_content(example_text)
print(result)

{'prediction': 2, 'rating': 0.98391956, 'flag': True}
