In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation
import re
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Define the list of gun-related words
gun_related_words = ['gun', '2nd amendment', 'second amendment', 'ar-15', 'assault rifle', 'pistols', 'shooting', 'mass shooting', 'school shooting', 'march for our lives']

# Function to check if any word from the list is present in the text
def contains_word(text, word_list):
    if pd.isnull(text):  # Check if the text is NaN
        return 0
    for word in word_list:
        if word.lower() in text.lower():  # Case insensitive match
            return 1
    return 0

# Load the CSV file into a pandas DataFrame
file_path = 'areen-prehandcode.csv' 
data = pd.read_csv(file_path)

# Create the new column 'dummy_label'
data['dummy_label'] = data['message'].apply(lambda x: contains_word(x, gun_related_words))

# Calculate number of documents
num_documents = len(data)

# Calculate average document length (in terms of words)
data['word_count'] = data['message'].apply(lambda x: len(str(x).split()))
average_doc_length = data['word_count'].mean()

# Distribution of document lengths
doc_length_distribution = data['word_count'].describe()

# Balance of the dataset
label_distribution = data['dummy_label'].value_counts(normalize=True)

print("Number of Documents:", num_documents)
print("Average Document Length (in words):", average_doc_length)
print("\nDistribution of Document Lengths:")
print(doc_length_distribution)
print("\nBalance of the Dataset:")
print(label_distribution)

# Filter documents related to gun legislation
gun_legislation_messages = [data['message'][i] for i, label in enumerate(data['dummy_label']) if label == 1]

# Custom function for text cleaning
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Convert ENGLISH_STOP_WORDS to a list
stop_words_list = list(ENGLISH_STOP_WORDS)

# Create TF-IDF vectorizer with text cleaning and stop words removal
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words_list, preprocessor=clean_text)

# Fit and transform the documents
tfidf_matrix = tfidf_vectorizer.fit_transform(gun_legislation_messages)

# Get feature names (words)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Sum TF-IDF scores for each word across documents
word_scores = tfidf_matrix.sum(axis=0)

# Sort the words by TF-IDF score in descending order
sorted_word_indices = word_scores.argsort()[0, ::-1]

# Display the top words associated with gun legislation
print("\nTop words associated with gun legislation:")
for i in range(10):  # Display top 10 words
    word_index = sorted_word_indices[0, i]
    word = tfidf_feature_names[word_index]
    score = word_scores[0, word_index]
    print(f"{word}: {score}")

# Create CountVectorizer with text cleaning and stop words removal
count_vectorizer = CountVectorizer(stop_words=stop_words_list, preprocessor=clean_text)

# Fit and transform the documents
X = count_vectorizer.fit_transform(gun_legislation_messages)

# Perform LDA
lda = LatentDirichletAllocation(n_components=2, random_state=42)  # Change n_components as needed
lda.fit(X)

# Display the top words for each topic
feature_names = count_vectorizer.get_feature_names_out()
print("\nTop words for each topic:")
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic {topic_idx}:")
    top_words_indices = topic.argsort()[:-11:-1]  # Display top 10 words
    top_words = [feature_names[i] for i in top_words_indices]
    print(", ".join(top_words))

# Perform sentiment analysis for each message
nltk.download('vader_lexicon')  # Download the lexicon required for sentiment analysis
sid = SentimentIntensityAnalyzer()

# Perform sentiment analysis for each message
sentiments = []
for message in data['message']:
    sentiment_score = sid.polarity_scores(message)
    # Classify sentiment based on compound score
    if sentiment_score['compound'] >= 0.05:
        sentiment = 'Positive'
    elif sentiment_score['compound'] <= -0.05:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'
    sentiments.append(sentiment)

# Add the sentiment labels to the DataFrame
data['sentiment'] = sentiments

# Display the DataFrame with sentiment labels
print("\nSentiment analysis results:")
print(data[['message', 'sentiment']])

Number of Documents: 200
Average Document Length (in words): 50.19

Distribution of Document Lengths:
count    200.00000
mean      50.19000
std       46.56852
min        7.00000
25%       23.00000
50%       40.00000
75%       60.50000
max      370.00000
Name: word_count, dtype: float64

Balance of the Dataset:
dummy_label
0    0.975
1    0.025
Name: proportion, dtype: float64

Top words associated with gun legislation:
fair: 0.489678716613991
background: 0.45312097254862194
congress: 0.4351420831574906
safe: 0.4007581783546891
nra: 0.3858366759919961
districts: 0.3672590374604932
action: 0.35761331334392155
stop: 0.3527582118648766
gun: 0.3527582118648766
sign: 0.32453484921680203

Top words for each topic:
Topic 0:
congress, background, check, brady, moving, safe, gun, stop, nra, sign
Topic 1:
fair, districts, ballot, effort, current, th, necessary, signatures, november, room


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Areen\AppData\Roaming\nltk_data...



Sentiment analysis results:
                                               message sentiment
0    <p>Refugee women are breadwinners, caretakers ...  Positive
1    <p>Is your church ready for Refugee Sunday?</p...  Positive
2    <p>We're launching a new version of IN THESE T...  Positive
3    <p>We’re a campaign that deeply believes that ...  Negative
4    <p>These women are pee’d off.</p><p> Curious? ...  Positive
..                                                 ...       ...
195  <p>I am asking you to make a contribution to o...  Positive
196  <span class="fwn fcg"><span class="fcg"><span ...  Positive
197  <p>“One thing that’s obvious is that the defea...  Negative
198  <p>I’ll be blunt: I need you to take my poll b...  Positive
199  <p>The biggest network of conservatives you'll...  Positive

[200 rows x 2 columns]


In [None]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Load the dataset containing political text data (e.g., speeches, news articles)
# Replace 'political_data.csv' with the path to your dataset
political_data = pd.read_csv('areen-prehandcode.csv')

# Initialize the sentiment analyzer
nltk.download('vader_lexicon')  # Download the lexicon required for sentiment analysis
sid = SentimentIntensityAnalyzer()

# Perform sentiment analysis for each political text
political_data['sentiment_score'] = political_data['text'].apply(lambda x: sid.polarity_scores(x)['compound'])

# Classify sentiment based on the compound score
political_data['sentiment'] = political_data['sentiment_score'].apply(lambda x: 'Positive' if x >= 0.05 else ('Negative' if x <= -0.05 else 'Neutral'))

# Analyze sentiment distribution
sentiment_distribution = political_data['sentiment'].value_counts(normalize=True)

# Display sentiment distribution
print("Sentiment Distribution:")
print(sentiment_distribution)