<a href="https://www.kaggle.com/code/thethirdchapter/profanity-score?scriptVersionId=148438814" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
#importing libraries

#importing data manipulation libraries
import pandas as pd
import numpy as np

#importing regex library
import re

#importing vizualization libraries
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
from bokeh.transform import factor_cmap
from bokeh.models import Whisker

## Data Preparation

In [2]:
#reading datasets
#tweet datasets
df= pd.read_csv(r"/kaggle/input/democratvsrepublicantweets/ExtractedTweets.csv")
df.head()

Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...


In [3]:
#renaming columns for ease of accessiblity
df.columns= df.columns.str.lower().str.strip()
df.columns

Index(['party', 'handle', 'tweet'], dtype='object')

In [4]:
#creating a set of profane terms using dataset available in kaggle
df_profane= pd.read_csv(r"/kaggle/input/bad-bad-words/bad-words.csv")
set_profane= set(df_profane["jigaboo"])

In [5]:
def calculate_profanity_score(tweet):
    
    try:
        # converts tweet to lowercase
        tweet = tweet.lower()  
        #removes punctuation marks
        tweet = re.sub(r"[^\w\s]", "", tweet) 
        # count occurrences of each profanity in the tweet
        total_profanity_words = sum(tweet.count(word) for word in set_profane) 
        # calculate the profanity score as a ratio of profanity words to total words in the tweet
        profanity_score = total_profanity_words / len(tweet.split())  
        
    #dealing with ZeroDivisionError
    except ZeroDivisionError:
        profanity_score= 0 
        
    return profanity_score

In [6]:
#storing the profanity score of each tweet in a new column
df["profanity_score"]= df.apply(lambda row: calculate_profanity_score(row["tweet"]), axis= 1)

In [7]:
df.head()

Unnamed: 0,party,handle,tweet,profanity_score
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P...",0.058824
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...,0.0
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...,0.157895
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...,0.055556
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...,0.0


In [8]:
df.sort_values("profanity_score", ascending= False).head()

Unnamed: 0,party,handle,tweet,profanity_score
45607,Republican,HouseSmallBiz,@ShopFloorNAM 😍👍🏽,2.0
38228,Democrat,RepHankJohnson,#EndGunViolence https://t.co/FfgDi9nBbQ,1.0
82336,Republican,RepScottPerry,RT @justinamash: …@Raul_Labrador-@repblumenaue...,1.0
81657,Republican,DanaRohrabacher,@abushofghosts nope,1.0
48193,Republican,RepFrenchHill,#SchumerShutdown https://t.co/WnhTGUOZY6,1.0


## Data Vizualization

In [9]:
#Creating custom color palette
color_palette= ("#041562", '#DA1212')
#defining buffer zone
buffer= 0.01

In [10]:
# Computing Average Profanity Score
top_avg_profanity_score = (
    df
    .groupby(["handle", "party"], as_index=False)
    .agg({'profanity_score': ['mean', 'std']})
    .sort_values(("profanity_score", "mean"), ascending=False)
    .iloc[:20]
    .assign(
        upper_lim = lambda df_: df_.loc[:, ("profanity_score", "mean")]+ df_.loc[:, ("profanity_score", "std")],
        lower_lim= lambda df_: df_.loc[:, ("profanity_score", "mean")]- df_.loc[:, ("profanity_score", "std")]
    )
)
# top_avg_profanity_score
# Creating ColumnDataSource
source = ColumnDataSource(top_avg_profanity_score)

# Defining Color Mapping
color_map = factor_cmap(
    field_name="party_",
    palette=color_palette,
    factors=np.unique(source.data["party_"])
)

# Creating the Figure
p = figure(
    x_axis_label= "Twitter Handles",
    x_range= source.data["handle_"],
    y_axis_label= "Profanity Score",
    y_range= (0, source.data["upper_lim_"].max()+ buffer),
    title= "Top 20 Most Profane Politicians/Handles",
    width= 1200,
    aspect_ratio= 2.0,
    tools= "save"
)

# Adding Error Bars
p.add_layout(
    Whisker(
        base= "handle_", upper= "upper_lim_", 
        lower= "lower_lim_", source= source, 
        level= "annotation",
        line_width= 1
    )
)

# Adding Vertical Bars to the Plot
p.vbar(
    x= "handle_", top= "profanity_score_mean", source=source,
    color=color_map,
    width=0.9,
    legend_field="party_"
)

# Adjusting X-Axis Labels Orientation
p.xaxis.major_label_orientation = np.pi / 3

# Displaying the Plot
show(p)

In [11]:
# Computing Average Profanity Score
least_avg_profanity_score = (
    df
    .groupby(["handle", "party"], as_index=False)
    .agg({'profanity_score': ['mean', 'std']})
    .sort_values(("profanity_score", "mean"))
    .iloc[:20]
    .assign(
        upper_lim = lambda df_: df_.loc[:, ("profanity_score", "mean")]+ df_.loc[:, ("profanity_score", "std")],
        lower_lim= lambda df_: df_.loc[:, ("profanity_score", "mean")]- df_.loc[:, ("profanity_score", "std")]
    )
)

# Creating ColumnDataSource
source = ColumnDataSource(least_avg_profanity_score)

# Defining Color Mapping
color_map = factor_cmap(
    field_name="party_",
    palette=color_palette,
    factors=np.unique(source.data["party_"])
)

# Creating the Figure
p = figure(
    x_axis_label="Twitter Handles",
    x_range=source.data["handle_"].tolist(),
    y_axis_label="Profanity Score",
    y_range=(0, source.data["upper_lim_"].max()+ buffer),
    title="Top 20 Most Profane Politicians/Handles",
    width=1200,
    aspect_ratio=2.0,
    tools="save"
)

# Adding Error Bars
p.add_layout(
    Whisker(
        base="handle_", upper="upper_lim_", 
        lower="lower_lim_", source=source, 
        level= "annotation",
        line_width=1
    )
)

# Adding Vertical Bars to the Plot
p.vbar(
    x='handle_', top='profanity_score_mean', source=source,
    color=color_map,
    width=0.9,
    legend_field="party_"
)

# Adjusting X-Axis Labels Orientation
p.xaxis.major_label_orientation = np.pi / 3

# Displaying the Plot
show(p)

In [12]:
source.data["party_"]

array(['Democrat', 'Republican', 'Democrat', 'Republican', 'Republican',
       'Republican', 'Republican', 'Republican', 'Republican', 'Democrat',
       'Republican', 'Democrat', 'Republican', 'Republican', 'Republican',
       'Democrat', 'Republican', 'Democrat', 'Republican', 'Democrat'],
      dtype=object)

In [13]:
# Computing the Average Profanity Score for each Party
party_profanity_score = (
    df
    .groupby("party", as_index=False)
    .agg({"profanity_score": ["mean", "std"]})
    .assign(
        upper_lim = lambda df_: df_.loc[:, ("profanity_score", "mean")]+ df_.loc[:, ("profanity_score", "std")],
        lower_lim= lambda df_: df_.loc[:, ("profanity_score", "mean")]- df_.loc[:, ("profanity_score", "std")]
    )
)

# Creating a ColumnDataSource
source = ColumnDataSource(party_profanity_score)

# Defining Color Mapping
color_map = factor_cmap(
    field_name="party_",
    palette=color_palette,
    factors=np.unique(source.data["party_"])
)

# Creating the Figure
p = figure(
    x_axis_label= "Party",
    x_range= source.data["party_"],
    y_axis_label= "Profanity Score",
    y_range= (0, source.data["upper_lim_"].max()+ buffer),
    title= "Average Profanity Score of Parties",
    tools= "save",
)

#Adding Error Bar
p.add_layout(
    Whisker(
        base= "party_", 
        upper= "upper_lim_", lower= "lower_lim_",source= source, 
        level= "annotation"
    )
)

# Adding Vertical Bars to the Plot
p.vbar(
    x="party_", top="profanity_score_mean", source=source,
    color=color_map,
    width=0.9
)



# Displaying the Plot
show(p)