In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer

import string
import nltk
#nltk.download('stopwords')
nltk.download('wordnet')
pd.set_option('display.max_columns', None)



df = pd.read_csv('labeled_data.csv')



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Preprocessing


In [2]:
#check null values
df.isnull().sum() 

Unnamed: 0            0
count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
dtype: int64

In [3]:

stopwords = nltk.corpus.stopwords.words("english")

#extending the stopwords to include other words used in twitter such as retweet(rt) etc.
other_exclusions = ["#ff", "ff", "rt" , "RT"]
stopwords.extend(other_exclusions)
lemmatizer = WordNetLemmatizer()

def preprocess(tweet):  
    # removal of extra spaces
    regex_pat = re.compile(r'\s+')
    tweet_space = tweet.str.replace(regex_pat, ' ' , regex=True)

    # removal of @name[mention]
    regex_pat = re.compile(r'@[\w\-]+')
    tweet_name = tweet_space.str.replace(regex_pat, '' , regex=True)

    # removal of links[https://abc.com]
    giant_url_regex =  re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
            r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    tweets = tweet_name.str.replace(giant_url_regex, '', regex=True)
    
    # removal of punctuations and numbers
    punc_remove = tweets.str.replace(r"[^a-zA-Z]", " ", regex=True)
    # remove whitespace with a single space
    newtweet=punc_remove.str.replace(r'\s+', ' ', regex=True)
    # remove leading and trailing whitespace
    newtweet=newtweet.str.replace(r'^\s+|\s+?$','', regex=True)
    # replace normal numbers with numbr
    newtweet=newtweet.str.replace(r'\d+(\.\d+)?','numbr', regex=True)
    # removal of capitalization
    tweet_lower = newtweet.str.lower()
    
    # tokenizing
    tokenized_tweet = tweet_lower.apply(lambda x: x.split())
    
    # removal of stopwords
    tokenized_tweet=  tokenized_tweet.apply(lambda x: [item for item in x if item not in stopwords])
    
    # stemming of the tweets
   # tokenized_tweet = tokenized_tweet.apply(lambda x: [lemmatizer.lemmatize(i) for i in x])
    
    for i in range(len(tokenized_tweet)):
        tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
        tweets_p= tokenized_tweet
    
    return tweets_p
    
processed_tweets = preprocess(df.tweet)   

df['processed_tweets'] = processed_tweets
print(df[["tweet","processed_tweets"]].head(10))


                                               tweet  \
0  !!! RT @mayasolovely: As a woman you shouldn't...   
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...   
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...   
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...   
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...   
5  !!!!!!!!!!!!!!!!!!"@T_Madison_x: The shit just...   
6  !!!!!!"@__BrighterDays: I can not just sit up ...   
7  !!!!&#8220;@selfiequeenbri: cause I'm tired of...   
8  " &amp; you might not get ya bitch back &amp; ...   
9  " @rhythmixx_ :hobbies include: fighting Maria...   

                                    processed_tweets  
0  woman complain cleaning house amp man always t...  
1  boy dats cold tyga dwn bad cuffin dat hoe st p...  
2       dawg ever fuck bitch start cry confused shit  
3                                   look like tranny  
4     shit hear might true might faker bitch told ya  
5  shit blows claim faithful somebody still fucki... 

In [4]:
# Assume 'df' is your DataFrame with the dataset
X = df[['processed_tweets']]
y = df['class']

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)



# Convert text to numerical features using TF-IDF
tfidf_vectorizer = TfidfVectorizer( max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['processed_tweets'])
X_val_tfidf = tfidf_vectorizer.transform(X_val['processed_tweets'])

X_train_final = X_train_tfidf
X_val_final = X_val_tfidf
# Train a model (e.g., RandomForestClassifier)
clf =  LogisticRegression(C = 4, random_state= 42)
clf.fit(X_train_final, y_train)

# Make predictions
predictions = clf.predict(X_val_final)

# Evaluate the model
accuracy = accuracy_score(y_val, predictions)
print(f"Accuracy: {accuracy}")
print(classification_report(y_val, predictions))

Accuracy: 0.8958307861596773
              precision    recall  f1-score   support

           0       0.46      0.21      0.29       465
           1       0.92      0.96      0.94      6335
           2       0.85      0.84      0.84      1379

    accuracy                           0.90      8179
   macro avg       0.74      0.67      0.69      8179
weighted avg       0.88      0.90      0.89      8179



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
""" #find best c parameter with cross validation
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.001, 0.01, 0, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(LogisticRegression(random_state=42 , max_iter= 10000), param_grid, cv=5 )
grid_search.fit(X_train_final, y_train)

best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}") """

' #find best c parameter with cross validation\nfrom sklearn.model_selection import GridSearchCV\nparam_grid = {\'C\': [0.001, 0.01, 0, 0.1, 1, 10, 100]}\ngrid_search = GridSearchCV(LogisticRegression(random_state=42 , max_iter= 10000), param_grid, cv=5 )\ngrid_search.fit(X_train_final, y_train)\n\nbest_params = grid_search.best_params_\nprint(f"Best Hyperparameters: {best_params}") '

In [21]:
# Example sentence
new_sentence = pd.Series('women')

new_sentence = preprocess(new_sentence)
print(new_sentence)


# Convert the new sentence to TF-IDF features
new_sentence_tfidf = tfidf_vectorizer.transform([new_sentence[0]])



# Make predictions for the new sentence
prediction = clf.predict(new_sentence_tfidf)

# Print the prediction
classifciationType = ''
if(prediction[0] == 0):
    classifciationType = 'Hate Speech'
elif(prediction[0] == 1): 
        classifciationType = 'Offesnive'
elif(prediction[0]==2):
        classifciationType = 'Neither'


print(f"The prediction for the sentence is: {classifciationType}")

0    women
dtype: object
The prediction for the sentence is: Offesnive
