In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
import numpy as np



In [2]:
# Load the dataset
tweets_d = pd.read_csv('Data/tweets_disc_cleaned.csv')
tweets_c = pd.read_csv('Data/tweets_ctrl_cleaned.csv')
data = pd.concat([tweets_d, tweets_c])
data

Unnamed: 0,date,id,content,likes,retweets,disc_cat,Unnamed: 0.1,disc_cat_num
0,28 February 2023,1630587451436683264,bitch lmfaooooo gone rock sock ass,0,0,gender,,2
1,28 February 2023,1630587450710958081,nvr vw person simpli cost repair hassl get don...,0,0,gender,,2
2,28 February 2023,1630587450673319940,hate wen bitch say stop act care df said act l...,0,0,gender,,2
3,28 February 2023,1630587450547634179,dnc pay well ask brooklyn dad hes lucki get pe...,0,0,gender,,2
4,28 February 2023,1630587449503264769,sinong bitch ba yung hindi marunong mag sorri ...,0,0,gender,,2
...,...,...,...,...,...,...,...,...
99996,01 March 2023,1630854187658489856,happi st david day everyon wale council blind ...,0,0,control,,0
99997,01 March 2023,1630854187603959808,congrat sir pls sir want sir better nigeria pu...,0,0,control,,0
99998,01 March 2023,1630854187595472896,ne abbiamo sempr bisogno,0,0,control,,0
99999,01 March 2023,1630854187587100672,wait video channel bro,0,0,control,,0


In [3]:
# Remove rows with NaN values
data = data.dropna()

In [4]:
data

Unnamed: 0,date,id,content,likes,retweets,disc_cat,Unnamed: 0.1,disc_cat_num
105210,02 March 2023,1631255697756405762,sam v wish new littl gf rot depht karma cuz bi...,0,0,gender,0.0,2
105211,02 March 2023,1631255696724336640,ye bharwi yehi kuch kr sakti hein dakait terro...,0,0,gender,1.0,2
105212,02 March 2023,1631255695059472385,thought pattern becom much much wider ca cancer,0,0,gender,2.0,2
105213,02 March 2023,1631255691955511297,sound bitch,0,0,gender,3.0,2
105214,02 March 2023,1631255691724816385,bitchsham candac mayb enough peopl shame canda...,0,0,gender,4.0,2
...,...,...,...,...,...,...,...,...
210912,28 February 2023,1630401486483251203,state lost confederaci,0,0,mental_health,105705.0,4
210914,28 February 2023,1630401254798118913,make sens,1,0,mental_health,105707.0,4
210915,28 February 2023,1630401175995527169,mean make sens miz host,0,0,mental_health,105708.0,4
210916,28 February 2023,1630400994222948352,disrespect key lime pieoh use lubric,1,0,mental_health,105709.0,4


In [5]:
# Create a bag of words representation of the tweets
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['content'])

In [6]:
discriminatory_words_pd= pd.read_csv('Data/discriminatory_words.csv')
discriminatory_words_pd

Unnamed: 0,sex,gender,race,health,mental_health
0,bitch,fag,n-word,crazy,crazy
1,whore,dyke,nigga,lunatic,lunatic
2,slut,tranny,nigger,psycho,psycho
3,cunt,shemale,spic,nutjob,nutjob
4,ho,chick with a dick,chink,schizo,schizo
...,...,...,...,...,...
68,manspread,,,,
69,bro,,,,
70,dude,,,,
71,guy,,,,


In [7]:
discriminatory_words_pd.dtypes

sex              object
gender           object
race             object
health           object
mental_health    object
dtype: object

In [8]:
discriminatory_words_pd = discriminatory_words_pd.astype(str)

In [9]:
discriminatory_words = {'sex': discriminatory_words_pd['sex'], 'gender' : discriminatory_words_pd['gender'], 'race' : discriminatory_words_pd['race'], 'health' : discriminatory_words_pd['health'], 'mental_health' : discriminatory_words_pd['mental_health']}

In [10]:
from scipy.sparse import hstack

# Add features for the presence of each discriminatory word
for category, words in discriminatory_words_pd.items():
    category_cols = [vectorizer.vocabulary_.get(word) for word in words]
    category_cols = [col for col in category_cols if col is not None and col in data.columns]
    if len(category_cols) > 0:
        word_col = data[category_cols].sum(axis=1)
        X = hstack([X, csr_matrix(word_col.values.reshape(-1, 1))])
    else:
        X = hstack([X, csr_matrix(np.zeros((data.shape[0], 1)))])
        
X = X.tocsr()

In [11]:
data[category_cols]

105210
105211
105212
105213
105214
...
210912
210914
210915
210916
210917


In [12]:
# Convert all values in discriminatory_words to strings
for category, words in discriminatory_words.items():
    discriminatory_words[category] = [str(word) for word in words]

# Compute the number of discriminatory words in each tweet
num_discriminatory_words = []
num_discriminatory_words = [sum(data['content'].str.contains(word) for word in words) for category, words in discriminatory_words.items() for i in range(len(data))]

# Add the new column to the dataframe
data['num_discriminatory_words'] = num_discriminatory_words

KeyError: 0

In [None]:
# Use the number of discriminatory words as the label
y = data['num_discriminatory_words']

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean squared error: {mse}')

In [None]:
# Provide suggestions based on the number of discriminatory words
if y_pred < 2:
    print('This tweet contains no discriminatory language.')
elif y_pred < 5:
    print('Consider revising this tweet to remove any potentially discriminatory language.')
else:
    print('This tweet contains a high number of discriminatory words and should be reviewed carefully.')
