In [None]:
! pip install transformers

In [None]:
import numpy as np
import pandas as pd 
import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud,STOPWORDS
from PIL import Image
from sklearn.utils import resample
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from transformers import RobertaTokenizer
from transformers import TFRobertaModel
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.layers import Flatten
from sklearn.metrics import precision_recall_curve

import tensorflow as tf
from sklearn.metrics import roc_auc_score, roc_curve,auc
from keras import regularizers
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix,f1_score,classification_report
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import label_binarize
from sklearn.metrics import average_precision_score
from itertools import cycle

lemma = WordNetLemmatizer()
stopword = set(STOPWORDS)
nltk.download('omw-1.4')
%matplotlib inline
import nltk
nltk.download('stopwords')
import nltk
nltk.download('wordnet')


from nltk.corpus import brown
nltk.download("brown")
import warnings
warnings.filterwarnings("ignore")


# Read Train Data

In [None]:
train_data=pd.read_csv("/kaggle/input/twitter-sentiment-analysis-hatred-speech/train.csv")

# Read Test Data

In [None]:
test_data=pd.read_csv("/kaggle/input/twitter-sentiment-analysis-hatred-speech/test.csv")

# Read Wordcloud Image

In [None]:
wordcloud_mask=np.array(Image.open("/kaggle/input/wodcloud-twiter-pic/twitter.png"))




```
* 0==Positive Data
* 1==Racist & Sexist Data
```



In [None]:
train_data.head()

In [None]:
test_data.head()

In [None]:
train_data=train_data.drop(["id"],axis=1)

In [None]:
train_data.head()

In [None]:
# Put the text column first and the target second. Selecting with the ordered
# list already reorders the columns, so no extra reindex step is needed.
new_column=["tweet","label"]
train_data=train_data[new_column]

In [None]:
train_data.head()

In [None]:
train_data['label'].value_counts()

# Racist Data Vs Positive Data

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(data=train_data, x='label', hue='label')
plt.title("Counting Racist & Sexist Vs Non Racist  Data\n",fontsize=20,color="deepskyblue")
plt.xlabel('Tweets')
plt.ylabel("Count")
plt.show()

In [None]:
train_data_len = train_data['tweet'].str.len()
test_data_len = test_data['tweet'].str.len()

In [None]:
print("Train Text Data Length is",train_data_len)

In [None]:
print("Test Text Data Length is",test_data_len)

# Train Data Length Vs Test Data Length

In [None]:
plt.figure(figsize=(10,10))
plt.hist(train_data_len, bins=20,label='train_data')
plt.hist(test_data_len , bins=20, label='test_data')
plt.title("Counting Train Data Text  Vs Test Data Text Length\n",fontsize=18,color="darkgoldenrod")
plt.legend()
plt.show()

# All Tweet Text Data WordCloud

In [None]:
# Word cloud over every training tweet, regardless of label.
all_tweet_text=" ".join(train_data["tweet"].tolist())
plt.figure(figsize=(10,10))
wordcloud=WordCloud(stopwords=stopword,width=1600,height=800,max_words=100,mask=wordcloud_mask,colormap='RdYlGn').generate(all_tweet_text)
# Draw the image once; a second imshow call would only stack an identical image on the axes.
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

# Racist_Sexist_Data Wordcloud

In [None]:
# Word cloud restricted to tweets labelled 1 (racist / sexist).
racist_sexist_data=train_data[train_data.label==1]
racist_tweet_text=" ".join(racist_sexist_data["tweet"].tolist())
plt.figure(figsize=(10,10))
wordcloud=WordCloud(stopwords=stopword,width=1600,height=800,max_words=100,mask=wordcloud_mask,colormap='Paired').generate(racist_tweet_text)
# Draw the image once; a second imshow call would only stack an identical image on the axes.
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

# Positive Data Wordcloud

In [None]:
# Word cloud restricted to tweets labelled 0 (positive).
positive_data=train_data[train_data.label==0]
positive_tweet_text=" ".join(positive_data["tweet"].tolist())
plt.figure(figsize=(10,10))
wordcloud=WordCloud(stopwords=stopword,width=1600,height=800,max_words=100,mask=wordcloud_mask,colormap='Dark2').generate(positive_tweet_text)
# Draw the image once; a second imshow call would only stack an identical image on the axes.
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

# Most Frequent Words In Train Data

In [None]:
# Bar chart of the 30 most frequent terms in the raw training tweets.
plt.figure(figsize=(20,20))

# Bag-of-words counts with scikit-learn's built-in English stop-word list removed.
count_vectorizer=CountVectorizer(stop_words="english")
word=count_vectorizer.fit_transform(train_data.tweet)

# Column sums of the document-term matrix give one total per vocabulary entry;
# vocabulary_ maps each term to its column index.
sum_words=word.sum(axis=0)
most_freq_words=[
    (all_word,sum_words[0,i])
    for all_word,i in count_vectorizer.vocabulary_.items()
]
most_freq_words.sort(key=lambda pair:pair[1],reverse=True)

frequent_words=pd.DataFrame(most_freq_words,columns=["Words","Most_Frequent_Words"])

# Only the 30 highest-count rows are plotted.
top_words=frequent_words.head(30)
word_list = list(top_words.Words)
word_value_list = list(top_words.Most_Frequent_Words)

plt.bar(word_list, word_value_list, color ="limegreen",width=0.5)
plt.title("Most Frequent Words In Train Data\n",fontsize=20,color="forestgreen")
plt.xlabel("\nWords",fontsize=15,color="darkorange")
plt.ylabel("Most_Frequent_Words\n",fontsize=15,color="darkorange")
plt.tight_layout(pad=0)
plt.show()

In [None]:
train_data.head()

# Data Preprocessing And Cleaning

In [None]:
# Build the English stop-word set once. Constructing it inside the per-token
# filter re-reads the NLTK stop-word list for every single word of every tweet.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def clean_text(tweet):
    """Normalise one raw tweet into a cleaned, space-separated string.

    Steps, in order:
      1. drop whitespace-separated tokens that start with '@' (user mentions),
      2. replace every character outside a-z / A-Z with a space,
      3. lowercase,
      4. remove NLTK English stop words,
      5. lemmatize each remaining word with the module-level ``lemma``.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Non-string values (e.g. NaN from a missing cell)
        yield an empty string instead of raising.

    Returns
    -------
    str
        The cleaned words joined by single spaces.
    """
    if not isinstance(tweet, str):
        return ""

    without_mentions = " ".join(
        token for token in tweet.split() if not token.startswith('@')
    )
    letters_only = re.sub('[^a-zA-Z]', ' ', without_mentions).lower()
    kept_words = [
        lemma.lemmatize(word)
        for word in letters_only.split()
        if word not in ENGLISH_STOPWORDS
    ]
    return " ".join(kept_words)

In [None]:
train_data['clean_tweet_text'] = train_data.tweet.apply(clean_text)

In [None]:
train_data.head()

# Extracting Hashtag Words

In [None]:
def extract_hashtag(tweet):
    """Return the hashtag words of a tweet as a cleaned, space-separated string.

    Keeps only whitespace-separated tokens that start with '#', replaces every
    non-letter character (including the '#') with a space, lowercases the
    result and lemmatizes each word with the module-level ``lemma``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        Lemmatized hashtag words joined by single spaces; empty when the tweet
        contains no hashtags.
    """
    hashtags = " ".join(token for token in tweet.split() if token.startswith('#'))
    hashtags = re.sub('[^a-zA-Z]', ' ', hashtags).lower()
    # Split into words before lemmatizing: iterating the string itself would
    # feed the lemmatizer one character at a time, which leaves the text
    # effectively unlemmatized.
    return " ".join(lemma.lemmatize(word) for word in hashtags.split())

In [None]:
train_data['word_with_hashtag'] = train_data.tweet.apply(extract_hashtag)

In [None]:
train_data.head()

# Most Common Racist and Sexist Words With Hashtag





In [None]:
racist_sexist_hashtag = FreqDist(list(" ".join(train_data[train_data['label']==1]['word_with_hashtag']).split())).most_common(15)
racist_sexist_data = pd.DataFrame(racist_sexist_hashtag, columns=['words', 'frequency'])
plt.figure(figsize=(20,20))
sns.barplot(x='words',y="frequency" ,data=racist_sexist_data,color="deepskyblue")

plt.title('Racist and Sexist Words with Hashtags\n',fontsize=20,color="darkorange")
plt.xlabel("\nWords",fontsize=20,color="darkorange")
plt.ylabel("Frequency\n",fontsize=20,color="darkorange")
plt.tight_layout(pad=0)
plt.show()

# Most Common Positive Words With Hashtag

In [None]:
# The 15 most common hashtag words among tweets labelled 0 (positive).
positive_hashtag = FreqDist(list(" ".join(train_data[train_data['label']==0]['word_with_hashtag']).split())).most_common(15)
positive_data = pd.DataFrame(positive_hashtag, columns=['words', 'frequency'])
plt.figure(figsize=(20,20))
sns.barplot(x='words',y="frequency" ,data=positive_data,color="orangered")

# This chart shows label 0, so the title must name the positive class
# (it previously repeated the racist/sexist title).
plt.title('Positive Words With Hashtag\n',fontsize=20,color="darkorange")
plt.xlabel("\nWords",fontsize=20,color="darkorange")
plt.ylabel("Frequency\n",fontsize=20,color="darkorange")
plt.tight_layout(pad=0)
plt.show()

In [None]:
train_data.head()

In [None]:
train_data=train_data.drop(["tweet","word_with_hashtag"],axis=1)

In [None]:
train_data.head()

In [None]:
max_len=128
text_data=train_data["clean_tweet_text"]
label_data=train_data["label"]

# Length of Text Data And Label Data

In [None]:
print("Length of Text Data :",len(text_data))
print("Length of Label Data :",len(label_data))

# roberta_tokenizer

In [None]:
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# roberta_model

In [None]:
roberta_model = TFRobertaModel.from_pretrained("roberta-base")

# Sample Text

In [None]:
text_data[1000]

In [None]:
encode_data=roberta_tokenizer.encode_plus(text_data[1000],add_special_tokens = True,max_length =40,pad_to_max_length = True,truncation=True)

In [None]:
encode_data

In [None]:
test_input_id=np.asarray(encode_data['input_ids'])
test_attention_mask=np.asarray(encode_data['attention_mask'])
output_data=roberta_model([test_input_id.reshape(1,-1),test_attention_mask.reshape(1,-1)])
type(output_data)

In [None]:
output_data

In [None]:
roberta_tokenizer.decode(encode_data["input_ids"])

In [None]:
final_text_data=train_data["clean_tweet_text"]
final_label_data=np.array(label_data)

In [None]:
def roberta_encode(final_text_data,max_len):
    """Tokenize texts into fixed-length RoBERTa input ids and attention masks.

    Parameters
    ----------
    final_text_data : iterable of str
        Texts to encode (a pandas Series or a list both work; values are
        consumed in iteration order).
    max_len : int
        Exact length of every encoded sequence. Shorter texts are padded and
        longer ones are truncated to this length.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        ``(input_ids, attention_masks)``, one row per input text.
    """
    input_ids=[]
    attention_masks=[]

    # Iterate over the values directly: `series[i]` is a label lookup on a
    # pandas Series and fails once the index is no longer exactly 0..n-1
    # (e.g. after filtering or resampling rows).
    for text in final_text_data:
        encode_data=roberta_tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            # `padding="max_length"` replaces the deprecated pad_to_max_length flag.
            padding="max_length",
            # Without truncation an over-long text yields a longer row and the
            # stacked arrays below become ragged.
            truncation=True,
            return_attention_mask=True,
        )
        input_ids.append(encode_data['input_ids'])
        attention_masks.append(encode_data["attention_mask"])
    return np.array(input_ids),np.array(attention_masks)
        

In [None]:
text_input_ids,text_attention_masks = roberta_encode(final_text_data,max_len)

In [None]:
print('Text Input Ids Shape {} \nText Input Attention Mask Shape {} \nLabel Data shape {}'.format(text_input_ids.shape,text_attention_masks.shape,final_label_data.shape))

In [None]:
X_train_input,X_test_input,Y_train_label,Y_test_label,train_mask,test_mask=train_test_split(text_input_ids,final_label_data,text_attention_masks,test_size=0.2,random_state=42,shuffle=True)

In [None]:
print('Train input shape {}\nTest input shape {}\nTrain label shape {}\nTest label shape {}\nTrain attention mask shape {}\nTest attention mask shape {}'.format(X_train_input.shape,X_test_input.shape,Y_train_label.shape,Y_test_label.shape,train_mask.shape,test_mask.shape))

# Create Roberta Model

In [None]:
def Create_Roberta_Model():
    """Assemble the RoBERTa-based two-class Keras classifier.

    The network takes two int32 inputs of length ``max_len`` (token ids and
    attention mask), runs them through a freshly loaded ``roberta-base``
    encoder, takes the vector at sequence position 0 of the encoder's first
    output, and feeds it through Dense(128, relu) -> Dropout(0.5) ->
    Dense(2, softmax).

    Returns
    -------
    tf.keras.models.Model
        Uncompiled model with inputs ``[input_ids, attention_masks]`` and a
        two-way softmax output.
    """
    token_ids = tf.keras.Input(shape=(max_len,), dtype="int32")
    token_mask = tf.keras.Input(shape=(max_len,), dtype="int32")

    encoder = TFRobertaModel.from_pretrained("roberta-base")
    encoder_outputs = encoder(input_ids=token_ids, attention_mask=token_mask)
    # First output, position 0 on the sequence axis — presumably the `<s>`
    # token added by the tokenizer's special tokens; confirm against the encoder.
    first_token_vector = encoder_outputs[0][:, 0, :]

    hidden = tf.keras.layers.Dense(128, activation="relu")(first_token_vector)
    hidden = tf.keras.layers.Dropout(0.5)(hidden)
    class_probabilities = tf.keras.layers.Dense(2, activation='softmax')(hidden)

    return tf.keras.models.Model(
        inputs=[token_ids, token_mask],
        outputs=class_probabilities,
    )


In [None]:
model=Create_Roberta_Model()
# The final layer is a softmax, so the loss receives probabilities, not logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by current Keras optimizers.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

model.compile(loss=loss,optimizer=optimizer, metrics=["accuracy"])


In [None]:
history=model.fit([X_train_input,train_mask],Y_train_label,batch_size=42,epochs=10,validation_data=([X_test_input,test_mask],Y_test_label))

In [None]:
def model_loss_and_accuracy(history):
    """Plot training/validation accuracy and loss side by side.

    Parameters
    ----------
    history : keras History
        Object returned by ``model.fit``; its ``history`` dict must contain
        the keys ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss``.
    """
    plt.figure(figsize=(15,15))

    # Left panel: accuracy per epoch.
    plt.subplot(221)
    plt.plot(history.history["accuracy"],marker="o",linestyle=":",markersize=10,color="m",label="accuracy")
    plt.plot(history.history["val_accuracy"],marker="D",linestyle=":",markersize=10,color="b",label="val_accuracy")
    plt.title("Model Accuracy\n",fontsize=20,color="darkorange")
    plt.xlabel("Number of Epochs",color="midnightblue",fontsize=15)
    plt.ylabel("Accuracy",color="midnightblue",fontsize=15)
    plt.grid(color = 'green', linestyle = '--', linewidth = 1)
    plt.legend(loc="best")
    plt.tight_layout()

    # Right panel: loss per epoch. This panel must read the loss series;
    # plotting the accuracy series here would just duplicate the left panel
    # under a "Model Loss" title.
    plt.subplot(222)
    plt.plot(history.history["loss"],marker="v",linestyle="-.",markersize=10,color="g",label="loss")
    plt.plot(history.history["val_loss"],marker="s",linestyle="--",markersize=10,color="r",label="val_loss")
    plt.title("Model Loss\n",fontsize=20,color="limegreen")
    plt.xlabel("Number of Epochs",color="midnightblue",fontsize=15)
    plt.ylabel("Loss",color="midnightblue",fontsize=15)
    plt.grid(color = 'green', linestyle = '--', linewidth = 1)
    plt.legend(loc="best")
    plt.tight_layout()
    plt.show()

In [None]:
model_loss_and_accuracy(history)

# F1 Score

In [None]:
def plot_f1_score(history):
    """Plot the weighted F1 score of the trained model on the held-out split.

    The History object only keeps the model's final weights, so the score is
    computed once and drawn as a constant line across the epoch axis. A true
    per-epoch F1 curve would need a callback that evaluates during training.

    Parameters
    ----------
    history : keras History
        Object returned by ``model.fit``; ``history.model`` supplies the
        trained weights and ``history.history['val_loss']`` the epoch count.
    """
    epochs = range(len(history.history['val_loss']))

    # Predict once: the weights are the same for every "epoch" here, so
    # repeating the prediction inside a loop only repeats identical work.
    probabilities = history.model.predict([X_test_input, test_mask])
    # The model emits one probability per class; the predicted label is the
    # index of the largest one. Rounding column 0 would flip the labels,
    # because column 0 is the probability of class 0.
    predictions = np.argmax(probabilities, axis=1)

    final_f1 = f1_score(Y_test_label, predictions, average='weighted')
    f1_scores = [final_f1] * len(epochs)

    plt.figure(figsize=(10,10))
    plt.plot(epochs, f1_scores, label='F1 score (final weights)',marker="*",color="g",linestyle="--",linewidth=4,markersize=8,markerfacecolor="r")
    plt.title("F1 Score\n",color="black",fontsize=20)
    plt.xlabel('Epochs',fontsize=15,color="black")
    plt.ylabel('F1 score',fontsize=15,color="black")
    plt.legend()
    plt.show()


plot_f1_score(history)


# Label Names

In [None]:
label_name=["Positive","Racist and Sexist"]

In [None]:
pred=model.predict([X_test_input, test_mask])
prediction=np.argmax(pred, axis=1)

# Accuracy Score

In [None]:
print("Accuracy Score is",accuracy_score(Y_test_label,prediction))

# ROC AUC Score

In [None]:
print("ROC AUC Score is {}".format(roc_auc_score(Y_test_label, pred[:,1])))

# Average Precision Score

In [None]:
# Get the predicted probabilities for the positive class
pred_probs = model.predict([X_test_input, test_mask])[:,1]

# Binarize the test labels
Y_test_binarized = (Y_test_label == 1).astype(int)

# Calculate the average precision score
average_precision = average_precision_score(Y_test_binarized, pred_probs)

print("Average Precision Score :", average_precision)


# Receiver Operating Characteristic (ROC) Curve

In [None]:
# ROC curve for class 1, scored with the model's class-1 probability.
pred_positive = pred[:,1]
fpr, tpr, thresholds = roc_curve(Y_test_label, pred_positive)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(10,10))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc,color="g",linestyle="--",marker="o",markersize=3,markerfacecolor="k")

plt.plot([0, 1], [0, 1],linestyle="--",linewidth=3,color="m")  # random predictions curve
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
# Axis label spelling corrected ("Specificity").
plt.xlabel('False Positive Rate or (1 - Specificity)')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()



In [None]:
# Calculate precision, recall and thresholds
# Calculate precision, recall and thresholds for class 1
precision, recall, thresholds = precision_recall_curve(Y_test_label,pred[:,1])

# Plot the PR curve
plt.figure(figsize=(10,10))
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color="darkmagenta")
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall Curve\n',fontsize=20,color="black")
# Axis labels are set once, with their final styling; an earlier unstyled
# pair of xlabel/ylabel calls was overwritten by these and has been dropped.
plt.xlabel("Recall",fontsize=15,color="violet")
plt.ylabel("Precision",fontsize=15,color="darkgreen")
plt.show()


In [None]:
# Encode the labels
label_encoder=LabelEncoder()
label_encoder.fit(Y_test_label)
test_label_data=label_encoder.transform(Y_test_label)
classes=label_encoder.classes_
colors = ["navy", "darkorange"]
plt.figure(figsize=(10,10))

# Compute the ROC curve and AUC for each class
fpr = {}
tpr = {}
roc_auc = dict()
for i, class_ in enumerate(classes):
    # Binarize the label data
    label_binarized = (test_label_data == i).astype(int)
    fpr[i], tpr[i], _ = roc_curve(label_binarized, pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

    # Plot the ROC curve for each class
    plt.plot(fpr[i], tpr[i], label='%s ROC Curve Class  (AUC=%0.2f)' % (class_, roc_auc[i]), marker=">", color=colors[i])

plt.plot([0,1],[0,1], marker="H", color="k", linestyle="--", linewidth=4)
plt.xlim([0,1])
plt.ylim([0,1.05])
plt.title('Compute The ROC Curve And AUC For Each Class', color="black", fontsize=15)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='lower right')
plt.show()


# Compute The Precision-Recall Curve And Area Under Curve For Each Class

In [None]:
# Encode the labels
label_encoder=LabelEncoder()
label_encoder.fit(Y_test_label)
test_label_data=label_encoder.transform(Y_test_label)
classes=label_encoder.classes_
colors = ["darkred", "purple"]
plt.figure(figsize=(10,10))

# Compute the PR curve and AUC for each class
precision = {}
recall = {}
pr_auc = dict()
for i, class_ in enumerate(classes):
    # Binarize the label data
    label_binarized = (test_label_data == i).astype(int)
    precision[i], recall[i], _ = precision_recall_curve(label_binarized, pred[:, i])
    pr_auc[i] = auc(recall[i], precision[i])

    # Plot the PR curve for each class
    plt.plot(recall[i], precision[i], label='%s Precision Curve Class  (AUC=%0.2f)' % (class_, pr_auc[i]),color=colors[i],marker="D",markersize=2,linestyle="-.")

plt.xlim([0,1])
plt.ylim([0,1.05])
plt.title('Compute The Precision Curve And Area Under Curve For Each Class\n',color="darkblue",fontsize=20)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='lower right')
plt.show()

In [None]:
plt.figure(figsize=(10,10))

cf_matrix=confusion_matrix(Y_test_label,prediction)
label_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
label_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
# sklearn's confusion_matrix puts true labels on rows and predicted labels on
# columns, with labels in sorted order (0, then 1). Taking class 1
# ("Racist and Sexist") as the positive class, the flattened row-major order
# is therefore: TN, FP, FN, TP.
label_names = ["True Negative","False Positive","False Negative","True Positive"]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(label_names,label_counts,label_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(cf_matrix, annot=labels, fmt="", cmap='gist_ncar',xticklabels=label_name,yticklabels=label_name)
plt.title("Confusion Matrix\n",color="gold",fontsize=20)
plt.ylabel("True Label\n",fontsize=15,color="yellow")
plt.xlabel("\nPredicted Label",fontsize=15,color="yellow")
plt.show()

In [None]:
print(classification_report(Y_test_label,prediction,target_names=label_name))

# Custom Data Prediction
> * 0==Positive
> * 1==Racist And Sexist

In [None]:
x="hank you very much for sharing your experience with us.We are really happy that your interaction with our brand was so positive.I just want to let you know that we are acting upon your feedback to make some vital changes to the way we operate [list of changes].As you can see, the opinions of our clients help us to provide better experiences and grow as a company.Regards,"
# The model was trained on text that went through clean_text, so the same
# cleaning must be applied here; feeding raw text would give the model
# inputs unlike anything it saw during training.
cleaned_x = clean_text(x)
# Encode to exactly max_len tokens: the model's input layers are built with
# shape (max_len,), so a separately hard-coded length could drift out of sync.
pred_input=roberta_tokenizer.encode_plus(cleaned_x,add_special_tokens = True,max_length =max_len,padding="max_length",truncation=True)
test_input_id=np.array(pred_input['input_ids'])
test_input_mask=np.array(pred_input['attention_mask'])
# Get the predicted probabilities
text_predict = model.predict([test_input_id.reshape(1,-1), test_input_mask.reshape(1,-1)])

# Get the class with the highest probability
predicted_class = np.argmax(text_predict, axis=-1)[0]

# Check if the input is toxic (1) or positive (0)
if predicted_class == 1:
    print("The input text is toxic.")
else:
    print("The input text is positive.")
