In [188]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
import string
import nltk
#nltk.download('stopwords')
pd.set_option('display.max_columns', None)



df = pd.read_csv('labeled_data.csv')



# Preprocessing


In [189]:
#check null values
df.isnull().sum() 

Unnamed: 0            0
count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
dtype: int64

In [190]:

stopwords = nltk.corpus.stopwords.words("english")

#extending the stopwords to include other words used in twitter such as retweet(rt) etc.
other_exclusions = ["#ff", "ff", "rt" , "RT"]
stopwords.extend(other_exclusions)
stemmer = PorterStemmer()

def preprocess(tweet):  
    # removal of extra spaces
    regex_pat = re.compile(r'\s+')
    tweet_space = tweet.str.replace(regex_pat, ' ' , regex=True)

    # removal of @name[mention]
    regex_pat = re.compile(r'@[\w\-]+')
    tweet_name = tweet_space.str.replace(regex_pat, '' , regex=True)

    # removal of links[https://abc.com]
    giant_url_regex =  re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
            r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    tweets = tweet_name.str.replace(giant_url_regex, '', regex=True)
    
    # removal of punctuations and numbers
    punc_remove = tweets.str.replace(r"[^a-zA-Z]", " ", regex=True)
    # remove whitespace with a single space
    newtweet=punc_remove.str.replace(r'\s+', ' ', regex=True)
    # remove leading and trailing whitespace
    newtweet=newtweet.str.replace(r'^\s+|\s+?$','', regex=True)
    # replace normal numbers with numbr
    newtweet=newtweet.str.replace(r'\d+(\.\d+)?','numbr', regex=True)
    # removal of capitalization
    tweet_lower = newtweet.str.lower()
    
    # tokenizing
    tokenized_tweet = tweet_lower.apply(lambda x: x.split())
    
    # removal of stopwords
    tokenized_tweet=  tokenized_tweet.apply(lambda x: [item for item in x if item not in stopwords])
    
    # stemming of the tweets
    tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) 
    
    for i in range(len(tokenized_tweet)):
        tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
        tweets_p= tokenized_tweet
    
    return tweets_p
    
processed_tweets = preprocess(df.tweet)   

df['processed_tweets'] = processed_tweets
print(df[["tweet","processed_tweets"]].head(10))


                                               tweet  \
0  !!! RT @mayasolovely: As a woman you shouldn't...   
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...   
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...   
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...   
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...   
5  !!!!!!!!!!!!!!!!!!"@T_Madison_x: The shit just...   
6  !!!!!!"@__BrighterDays: I can not just sit up ...   
7  !!!!&#8220;@selfiequeenbri: cause I'm tired of...   
8  " &amp; you might not get ya bitch back &amp; ...   
9  " @rhythmixx_ :hobbies include: fighting Maria...   

                                    processed_tweets  
0  woman complain clean hous amp man alway take t...  
1  boy dat cold tyga dwn bad cuffin dat hoe st place  
2         dawg ever fuck bitch start cri confus shit  
3                                   look like tranni  
4     shit hear might true might faker bitch told ya  
5      shit blow claim faith somebodi still fuck hoe 

In [191]:
# Assume 'df' is your DataFrame with the dataset
X = df[['processed_tweets', 'count', 'offensive_language', 'hate_speech' , 'neither']]
y = df['class']

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



# Convert text to numerical features using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['processed_tweets'])
X_val_tfidf = tfidf_vectorizer.transform(X_val['processed_tweets'])

# Concatenate TF-IDF features with other numerical features
X_train_final = X_train[['count', 'offensive_language', 'hate_speech','neither']].values
X_train_final = hstack([X_train_tfidf, X_train_final])
X_val_final = X_val[['count', 'offensive_language', 'hate_speech','neither']].values
X_val_final = hstack([X_val_tfidf, X_val_final])

# Train a model (e.g., RandomForestClassifier)
clf =  LogisticRegression(random_state=42, max_iter=500, C=0.00005)
clf.fit(X_train_final, y_train)

# Make predictions
predictions = clf.predict(X_val_final)

# Evaluate the model
accuracy = accuracy_score(y_val, predictions)
print(f"Accuracy: {accuracy}")
print(classification_report(y_val, predictions))

Accuracy: 0.8908614081097438
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       290
           1       0.88      1.00      0.93      3832
           2       1.00      0.70      0.82       835

    accuracy                           0.89      4957
   macro avg       0.62      0.57      0.59      4957
weighted avg       0.85      0.89      0.86      4957



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [192]:
# Example sentence
new_sentence ='i love women'

# Preprocess the new sentence
new_sentence = new_sentence.lower()
new_sentence = pd.Series(new_sentence).replace('[^a-zA-Z0-9]', ' ', regex=True)[0]

# Convert the new sentence to TF-IDF features
new_sentence_tfidf = tfidf_vectorizer.transform([new_sentence])

# Add other numerical features if needed
# Example:
new_numerical_features = [0, 0, 0, 0]  # Replace with your own numerical features
new_sentence_final = hstack([new_sentence_tfidf, new_numerical_features])

# Make predictions for the new sentence
prediction = clf.predict(new_sentence_final)

# Print the prediction
print(f"The prediction for the sentence is: {prediction[0]}")

The prediction for the sentence is: 1
