In [8]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import csv

In [4]:
# Load the tweet dataset that already carries the classifier output
# (the preview below shows model_inference, normal_score and hate_speech_score
# columns) and display the first rows as a sanity check.
df = pd.read_csv('data/completeDataset_inference.csv')
df.head()

Unnamed: 0,Date,Tweet Treated,Tweet Raw,Url,Id,index,model_inference,normal_score,hate_speech_score
0,2022-01-01 23:59:41+00:00,goldfinger &amp; from russia with love,@donwinslow Goldfinger &amp; From Russia with ...,https://twitter.com/coolvee2222/status/1477429...,1477429351952130051,0,nothate,0.9996,0.0004
1,2022-01-01 23:59:23+00:00,thank you! russia musume 1 or dream note please!,@aiuncensored thank you! russia musume 1 or dr...,https://twitter.com/CsarVsq30904305/status/147...,1477429275926179841,1,nothate,0.9997,0.0003
2,2022-01-01 23:59:23+00:00,"new in shop! vintage russian nesting dolls, cl...","New in Shop! Vintage Russian Nesting Dolls, Cl...",https://twitter.com/trashtique/status/14774292...,1477429275024310273,2,nothate,0.9997,0.0003
3,2022-01-01 23:59:13+00:00,if the chinese government donated to the gqp ...,@RepThomasMassie If the Chinese government don...,https://twitter.com/SouthernNotSt/status/14774...,1477429235589566466,3,nothate,0.9987,0.0013
4,2022-01-01 23:58:55+00:00,i'm dying to know how redacted tonight fits i...,@jimstewartson I'm dying to know how Redacted ...,https://twitter.com/rscobe1920/status/14774291...,1477429159307661317,4,nothate,0.9999,0.0001


## Counting the words

In [7]:


# Raw tweet texts as a plain Python list; rows whose 'Tweet Raw' is missing are left out
tweets = df.loc[df['Tweet Raw'].notna(), 'Tweet Raw'].to_list()

# Function to extract words with more than 4 characters
def extract_words(text):
    """Return the lower-cased words of ``text`` that are at least 5 characters long.

    A "word" is a maximal run of ``\\w`` characters, so punctuation splits
    tokens: URLs contribute fragments such as ``https`` and contractions such
    as ``doesn't`` contribute ``doesn``.

    Parameters
    ----------
    text : str
        The tweet text. Any non-string value (for example the float NaN that
        pandas uses for a missing cell) is treated as an empty tweet.

    Returns
    -------
    list of str
        Matching words in order of appearance, duplicates included.
    """
    # Missing cells arrive as NaN/None; they have no .lower(), so bail out early
    # instead of raising AttributeError in the middle of a long row loop.
    if not isinstance(text, str):
        return []
    return re.findall(r'\b\w{5,}\b', text.lower())

# Flatten the per-tweet word lists into one long list of words
all_words = [word for tweet in tweets for word in extract_words(tweet)]

# Tally how many times each word occurs over the whole corpus
word_counts = Counter(all_words)

# Keep the 50 most frequent words as a {word: count} dict, most frequent first
top_50_words = dict(word_counts.most_common(50))

In [6]:
# Display the 50 most frequent words (5+ characters) with their occurrence counts
top_50_words

{'ukraine': 375763,
 'russia': 326928,
 'putin': 282789,
 'https': 274969,
 'zelensky': 234300,
 'about': 85426,
 'russian': 85384,
 'people': 77152,
 'their': 70590,
 'would': 69238,
 'there': 63935,
 'biden': 56351,
 'trump': 52552,
 'world': 52532,
 'president': 46318,
 'because': 42761,
 'think': 42659,
 'ukrainian': 42430,
 'country': 41891,
 'should': 40852,
 'china': 37442,
 'against': 36925,
 'money': 35805,
 'going': 33973,
 'right': 33671,
 'after': 32333,
 'being': 30386,
 'military': 29489,
 'support': 28544,
 'other': 28514,
 'could': 27629,
 'still': 25537,
 'which': 25339,
 'never': 25037,
 'weapons': 24067,
 'years': 22697,
 'russians': 22564,
 'peace': 22471,
 'these': 22214,
 'really': 21498,
 'america': 21476,
 'before': 21279,
 'doesn': 21156,
 'those': 21143,
 'where': 20937,
 'nuclear': 20737,
 'invasion': 19538,
 'europe': 19454,
 'countries': 19030,
 'while': 18730}

In [8]:
# Per-word tally of tweets by predicted label, restricted to the top-50 words.
# Each tweet contributes at most once per word (see the set() below), so these
# are tweet counts, not occurrence counts.
model_inference_counts = {word: {'nothate': 0, 'hate': 0} for word in top_50_words}

# Rows without a tweet text are skipped, mirroring the dropna() used when the
# vocabulary was built.
labelled_tweets = df[['Tweet Raw', 'model_inference']].dropna(subset=['Tweet Raw'])

# Walk the two columns directly: iterrows() would build a full Series per row,
# which is very slow on a frame of this size.
for raw_tweet, label in zip(labelled_tweets['Tweet Raw'], labelled_tweets['model_inference']):
    # Any label other than 'nothate' is counted in the 'hate' bucket.
    bucket = 'nothate' if label == 'nothate' else 'hate'
    # set(): a word repeated inside one tweet still counts that tweet once.
    for word in set(extract_words(raw_tweet)):
        if word in model_inference_counts:
            model_inference_counts[word][bucket] += 1

model_inference_counts

{'ukraine': {'nothate': 316110, 'hate': 14457},
 'russia': {'nothate': 269375, 'hate': 17049},
 'putin': {'nothate': 241980, 'hate': 15561},
 'https': {'nothate': 244661, 'hate': 8253},
 'zelensky': {'nothate': 209005, 'hate': 12884},
 'about': {'nothate': 71784, 'hate': 4633},
 'russian': {'nothate': 73568, 'hate': 3383},
 'people': {'nothate': 64513, 'hate': 4834},
 'their': {'nothate': 58782, 'hate': 3858},
 'would': {'nothate': 58436, 'hate': 2294},
 'there': {'nothate': 55600, 'hate': 2582},
 'biden': {'nothate': 46730, 'hate': 2714},
 'trump': {'nothate': 40376, 'hate': 2534},
 'world': {'nothate': 45075, 'hate': 3342},
 'president': {'nothate': 40567, 'hate': 1670},
 'because': {'nothate': 37984, 'hate': 2333},
 'think': {'nothate': 38316, 'hate': 1738},
 'ukrainian': {'nothate': 38020, 'hate': 1422},
 'country': {'nothate': 35753, 'hate': 3252},
 'should': {'nothate': 36038, 'hate': 2290},
 'china': {'nothate': 30635, 'hate': 2438},
 'against': {'nothate': 32949, 'hate': 1612},

In [9]:
# Attach the hate-to-nothate ratio to every word's tally under the 'rate' key
for tally in model_inference_counts.values():
    hate_total, nothate_total = tally['hate'], tally['nothate']
    # A word never seen in a 'nothate' tweet gets an infinite ratio instead of
    # triggering a division by zero.
    tally['rate'] = hate_total / nothate_total if nothate_total > 0 else float('inf')

model_inference_counts

{'ukraine': {'nothate': 316110, 'hate': 14457, 'rate': 0.04573407990889247},
 'russia': {'nothate': 269375, 'hate': 17049, 'rate': 0.06329095127610208},
 'putin': {'nothate': 241980, 'hate': 15561, 'rate': 0.0643069675179767},
 'https': {'nothate': 244661, 'hate': 8253, 'rate': 0.033732388897290536},
 'zelensky': {'nothate': 209005, 'hate': 12884, 'rate': 0.06164445826654865},
 'about': {'nothate': 71784, 'hate': 4633, 'rate': 0.0645408447564917},
 'russian': {'nothate': 73568, 'hate': 3383, 'rate': 0.04598466724662897},
 'people': {'nothate': 64513, 'hate': 4834, 'rate': 0.07493063413575558},
 'their': {'nothate': 58782, 'hate': 3858, 'rate': 0.06563233642951924},
 'would': {'nothate': 58436, 'hate': 2294, 'rate': 0.03925662262988569},
 'there': {'nothate': 55600, 'hate': 2582, 'rate': 0.04643884892086331},
 'biden': {'nothate': 46730, 'hate': 2714, 'rate': 0.058078322276909906},
 'trump': {'nothate': 40376, 'hate': 2534, 'rate': 0.06276005547850208},
 'world': {'nothate': 45075, 'hat

## Replacing the most common names

In [5]:
# Pool of replacement names mapped to a gender marker ("M" / "F").
# The keys are what replace_words() samples from; key order matters because the
# seeded random choice picks by position.
# Note: "Putin" and "Zelensky" are themselves part of the pool, so a replacement
# can occasionally yield one of the original names.
slavic_names_gender = {
    "Aleksandr": "M", "Boris": "M", "Bogdan": "M", "Dmitry": "M", "Igor": "M", 
    "Ivan": "M", "Kirill": "M", "Maksim": "M", "Mikhail": "M", "Pavel": "M",
    "Anastasia": "F", "Elena": "F", "Katarina": "F", "Evgenia": "F", "Marina": "F", 
    "Nina": "F", "Olga": "F", "Svetlana": "F", "Tatyana": "F", "Yana": "F",
    # Spelling fixed from "Zelenzky" so it matches the name replaced in the tweets
    "Putin": "M", "Zelensky": "M"
}

In [9]:
# Function to replace words

# Seed NumPy's global generator so the sequence of drawn names is reproducible.
# The stream is shared by every later call, so results depend on call order.
np.random.seed(1)

def replace_words(text, words_to_replace=("putin", "zelensky"), replacement_names=None):
    """Swap every whole-word occurrence of the target names for one random name.

    A single replacement name is drawn per call, so all targets found in the
    same text are replaced by the same (lower-cased) name. Matching is
    case-insensitive and limited to whole words.

    Parameters
    ----------
    text : str
        Text to process. Non-string values (e.g. NaN for a missing cell) are
        returned unchanged.
    words_to_replace : iterable of str, optional
        Names to look for. Defaults to ``("putin", "zelensky")``.
    replacement_names : iterable of str, optional
        Pool to draw the replacement from. Defaults to the keys of the
        module-level ``slavic_names_gender`` dict.

    Returns
    -------
    str
        The text with the target names replaced (or ``text`` itself when it is
        not a string).
    """
    if replacement_names is None:
        replacement_names = slavic_names_gender.keys()

    # Draw before any early return: exactly one draw per call keeps the seeded
    # random stream aligned with the row order, whatever the row contains.
    random_word = str(np.random.choice(list(replacement_names))).lower()

    # An empty alternation would match at every word boundary, so guard it too.
    if not isinstance(text, str) or not words_to_replace:
        return text

    # re.escape keeps regex metacharacters in a target name literal.
    pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_replace) + r')\b'
    # A callable replacement inserts the name verbatim (no backslash/group parsing).
    return re.sub(pattern, lambda _match: random_word, text, flags=re.IGNORECASE)

# Store the name-swapped text next to the treated text, then flag the rows
# whose text actually changed.
# NOTE(review): the flag column is spelled 'Raplaced' (presumably meant
# 'Replaced'); it is kept as-is because the exported CSVs carry this name.
df['Tweet Replaced'] = df['Tweet Treated'].apply(replace_words)
df['Raplaced'] = df['Tweet Replaced'].ne(df['Tweet Treated'])
df.head()

Unnamed: 0,Date,Tweet Treated,Tweet Raw,Url,Id,index,model_inference,normal_score,hate_speech_score,Tweet Replaced,Raplaced
0,2022-01-01 23:59:41+00:00,goldfinger &amp; from russia with love,@donwinslow Goldfinger &amp; From Russia with ...,https://twitter.com/coolvee2222/status/1477429...,1477429351952130051,0,nothate,0.9996,0.0004,goldfinger &amp; from russia with love,False
1,2022-01-01 23:59:23+00:00,thank you! russia musume 1 or dream note please!,@aiuncensored thank you! russia musume 1 or dr...,https://twitter.com/CsarVsq30904305/status/147...,1477429275926179841,1,nothate,0.9997,0.0003,thank you! russia musume 1 or dream note please!,False
2,2022-01-01 23:59:23+00:00,"new in shop! vintage russian nesting dolls, cl...","New in Shop! Vintage Russian Nesting Dolls, Cl...",https://twitter.com/trashtique/status/14774292...,1477429275024310273,2,nothate,0.9997,0.0003,"new in shop! vintage russian nesting dolls, cl...",False
3,2022-01-01 23:59:13+00:00,if the chinese government donated to the gqp ...,@RepThomasMassie If the Chinese government don...,https://twitter.com/SouthernNotSt/status/14774...,1477429235589566466,3,nothate,0.9987,0.0013,if the chinese government donated to the gqp ...,False
4,2022-01-01 23:58:55+00:00,i'm dying to know how redacted tonight fits i...,@jimstewartson I'm dying to know how Redacted ...,https://twitter.com/rscobe1920/status/14774291...,1477429159307661317,4,nothate,0.9999,0.0001,i'm dying to know how redacted tonight fits i...,False


In [10]:
# Load the second dataset; the preview below shows the columns
# row_id, Row Number, Tweet Treated and Label.
df_20k = pd.read_csv('data/hs_df.csv')

# Preview the first rows
df_20k.head()

Unnamed: 0,row_id,Row Number,Tweet Treated,Label
0,fffeb77ff91c618cc5482e982240f1af9f09175cddf324...,19999,russia would save a ton of money if they'd pul...,Non Hate
1,ffe7a960fccf628755ee70ab15e4fab5b45f0f436a2064...,19998,i hate grocery shopping. spent $112. damn you ...,Non Hate
2,ffdc308f7c1ceed12d8347ab9150551a9fe155023d624a...,19997,did you miss his blood and soil arguments f...,Non Hate
3,ffdbe9613a9ef0c6c484486e03422ab0bac73f62922005...,19996,"on imperialism, too complex for twitter, but ...",Non Hate
4,ffd75a28b8bf37f681e5d57d1dd1309df03aa3859a3d28...,19995,i'm still wondering why we don't #stop doing #...,Non Hate


In [11]:
# Same name-swapping step for the second dataset: keep the swapped text in
# 'Tweet Replaced' and mark in 'Raplaced' the rows whose text changed.
# The names drawn here continue the random stream already used for `df`.
df_20k['Tweet Replaced'] = df_20k['Tweet Treated'].apply(replace_words)
df_20k['Raplaced'] = df_20k['Tweet Replaced'].ne(df_20k['Tweet Treated'])

In [12]:
# Write both name-swapped frames to disk without the index; every non-numeric
# field is quoted so tweet text containing commas or quotes stays intact.
for frame, out_path in (
    (df, "data/completeDatasetUnamed.csv"),
    (df_20k, "data/hs_dfUnamed.csv"),
):
    frame.to_csv(out_path, index=False, quoting=csv.QUOTE_NONNUMERIC)