In [1]:
import pandas as pd

#import text
data = pd.read_csv('/home/jandolina/teams/jack_areen_rubin/Data/fbpac-ads-en-US.csv')

# Using presence of words to determine whether an ad is related to the 2nd amendment or gun policy

In [2]:
gun_related_words = ['gun','2nd amendment', 'second amendment', 'ar-15', 'assault rifle', 'pistols', 'shooting', 'mass shooting', 'school shooting', 'march for our lives']

# Function to check if any word from the list is present in the text
def contains_word(text, word_list):
    for word in word_list:
        if word in text:
            return 1
    return 0

# Create the new column
data['dummy_label'] = data['message'].apply(lambda x: contains_word(x, gun_related_words))

In [3]:
gun_code = data[data['dummy_label'] == 1]
no_gun_code = data[data['dummy_label'] == 0]

In [14]:
gun_code_sample=gun_code[:1000]
no_gun_code_sample = no_gun_code[:3000]

In [9]:
sample_df = data.sample(n=4000, random_state=42)

In [16]:
sample_df = pd.concat([gun_code_sample, no_gun_code_sample], axis=0)

In [17]:
print(sample_df[data['dummy_label'] == 1])

                                               id  \
72                              23842756757600772   
114                             23842761814840048   
327                             23843363987580262   
337                                 6102775332336   
348    hyperfeed_story_id_5c94f1dae77c55922092556   
...                                           ...   
35776                           23842797121080141   
35782                           23842974222350252   
35783                           23842794823000141   
35784                           23842722359490239   
35788                           23842794106150141   

                                                    html  political  \
72     <div class="_1dwg _1w_m _q7o"><div class="_5g-...          0   
114    <div class="_1dwg _1w_m _q7o"><div class="_5g-...          0   
327    <div class="_5pcr userContentWrapper"><div cla...          0   
337    <div class="_5pcr userContentWrapper"><div cla...          0   
348    <

  print(sample_df[data['dummy_label'] == 1])


In [18]:
# Export the DataFrame to a CSV file
#sample_df.to_csv('jack_handcode.csv', index=False)


# Data Descriptive Analysis

In [8]:
# Number of Documents
num_documents = len(data)

# Average Document Length (in terms of words)
data['word_count'] = data['message'].apply(lambda x: len(str(x).split()))
average_doc_length = data['word_count'].mean()

# Distribution of Document Lengths
doc_length_distribution = data['word_count'].describe()

# Balance of the Dataset
label_distribution = data['dummy_label'].value_counts(normalize=True)

print("Number of Documents:", num_documents)
print("Average Document Length (in words):", average_doc_length)
print("\nDistribution of Document Lengths:")
print(doc_length_distribution)
print("\nBalance of the Dataset:")
print(label_distribution)

Number of Documents: 162324
Average Document Length (in words): 55.08601315886745

Distribution of Document Lengths:
count    162324.000000
mean         55.086013
std         103.500739
min           1.000000
25%          22.000000
50%          36.000000
75%          57.000000
max        2976.000000
Name: word_count, dtype: float64

Balance of the Dataset:
dummy_label
0    0.972555
1    0.027445
Name: proportion, dtype: float64


# TFIDF

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re

# Filter documents related to gun legislation
gun_legislation_messages = [data['message'][i] for i, label in enumerate(data['dummy_label']) if label == 1]

# Custom function for text cleaning
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Create TF-IDF vectorizer with text cleaning and stop words removal
vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, preprocessor=clean_text)

# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(gun_legislation_messages)

# Get feature names (words)
feature_names = vectorizer.get_feature_names()

# Sum TF-IDF scores for each word across documents
word_scores = tfidf_matrix.sum(axis=0)

# Sort the words by TF-IDF score in descending order
sorted_word_indices = word_scores.argsort()[0, ::-1]

# Display the top words associated with gun legislation
print("Top words associated with gun legislation:")
for i in range(10):  # Display top 10 words
    word_index = sorted_word_indices[0, i]
    word = feature_names[word_index]
    score = word_scores[0, word_index]
    print(f"{word}: {score}")


Top words associated with gun legislation:
gun: 238.4225606855614
violence: 140.2045461840269
congress: 103.94849943441567
help: 99.16536413251045
petition: 96.11251731509047
pp: 92.72110110178767
guns: 92.22281608193832
people: 89.22734646432878
sign: 88.34178056687114
make: 86.0165474003404


# LDA
Need to remove the html elements

In [25]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Create CountVectorizer with text cleaning and stop words removal
vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS, preprocessor=clean_text)

# Fit and transform the documents
X = vectorizer.fit_transform(gun_legislation_messages)

# Perform LDA
lda = LatentDirichletAllocation(n_components=2, random_state=42)  # Change n_components as needed
lda.fit(X)

# Display the top words for each topic
feature_names = vectorizer.get_feature_names()
print("Top words for each topic:")
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic {topic_idx}:")
    top_words_indices = topic.argsort()[:-11:-1]  # Display top 10 words
    top_words = [feature_names[i] for i in top_words_indices]
    print(", ".join(top_words))

Top words for each topic:
Topic 0:
people, pp, gun, time, day, violence, class_afxspan, class_cl, _afzspanspan, world
Topic 1:
gun, help, violence, pp, congress, guns, petition, need, make, background


# Sentiment Analysis

In [26]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize the sentiment analyzer
nltk.download('vader_lexicon')  # Download the lexicon required for sentiment analysis
sid = SentimentIntensityAnalyzer()

# Perform sentiment analysis for each message
sentiments = []
for message in data['message']:
    sentiment_score = sid.polarity_scores(message)
    # Classify sentiment based on compound score
    if sentiment_score['compound'] >= 0.05:
        sentiment = 'Positive'
    elif sentiment_score['compound'] <= -0.05:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'
    sentiments.append(sentiment)

# Add the sentiment labels to the DataFrame
data['sentiment'] = sentiments

# Display the DataFrame with sentiment labels
print("Sentiment analysis results:")
print(data)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jandolina/nltk_data...


Sentiment analysis results:
                                                id  \
0       hyperfeed_story_id_5c9baa3ee0ec08073500042   
1       hyperfeed_story_id_5c9bb2a2413852086735771   
2       hyperfeed_story_id_5c9bb4fa461731e29426627   
3                                23843380741530360   
4       hyperfeed_story_id_5c9bb059454851c17741213   
...                                            ...   
162319                           23843108782710078   
162320                           23843034525850259   
162321                           23842997138670612   
162322  hyperfeed_story_id_5c8b16b11b8f86515960964   
162323                           23842885237930242   

                                                     html  political  \
0       <div class="_5pa- userContentWrapper"><div cla...          0   
1       <div class="_5pa- userContentWrapper"><div cla...          0   
2       <div class="_5pa- userContentWrapper"><div cla...          0   
3       <div class="_5pcr userConte

# Named Entity Recognition

In [27]:
import spacy

# Load the English NER model in spaCy
nlp = spacy.load("en_core_web_sm")

# Process the text with spaCy NER
doc = nlp(text)

# Function to perform Named Entity Recognition (NER) using spaCy
def perform_ner(text):
    doc = nlp(text)
    named_entities = [(entity.text, entity.label_) for entity in doc.ents]
    return named_entities

# Apply NER to each text in the 'text' column
data['named_entities'] = data['text'].apply(perform_ner)



ModuleNotFoundError: No module named 'spacy'