In [1]:
import pandas as pd
import time
from pathlib import Path
start_time = time.time()

my_csv = Path("C:/Users/adika/OneDrive/Desktop/ub/summer23/DIC_587/project/shooting-1982-2023.csv")
df = pd.read_csv(my_csv.resolve(), sep=',')

end_time = time.time()
elapsed_time = end_time - start_time

print("Time taken to load the CSV file:", elapsed_time, "seconds")


Time taken to load the CSV file: 0.058754682540893555 seconds


In [2]:
df.dtypes

case                                object
location                            object
date                                object
summary                             object
fatalities                           int64
injured                              int64
total_victims                        int64
location.1                          object
age_of_shooter                       int64
prior_signs_mental_health_issues    object
mental_health_details               object
weapons_obtained_legally            object
where_obtained                      object
weapon_type                         object
weapon_details                      object
race                                object
gender                              object
type                                object
year                                 int64
dtype: object

In [8]:
df['summary'] = df['summary'].astype('string')

In [9]:
start_time = time.time()
df = df.dropna(subset=['summary'])
df['summary'] = df['summary'].str.lower()
df['summary'] = df['summary'].str.split()

word_count = {}

for row in df['summary']:
    for word in row:
        word_count[word] = word_count.get(word, 0) + 1

word_count_df = pd.DataFrame.from_dict(word_count, orient='index', columns=['Frequency'])
word_count_df.index.name = 'Word'
word_count_df = word_count_df.sort_values(by='Frequency', ascending=False)
print(word_count_df)

end_time = time.time()
elapsed_time = end_time - start_time
print("Time taken to perform word count", elapsed_time, "seconds")


                        Frequency
Word                             
"'a',",                       283
"'the',",                     212
"'and',",                     191
"'in',",                      118
"'he',",                      110
"'at',",                       98
"'to',",                       97
"'was',",                      94
"'of',",                       74
"'fire',",                     71
"'his',",                      69
"'opened',",                   69
"'before',",                   64
"'shot',",                     64
"'by',",                       54
"'police',",                   53
"'with',",                     51
"'on',",                       49
"'killed',",                   44
"'an',",                       42
"'after',",                    40
"'had',",                      39
"'then',",                     31
"'who',",                      31
"'three',",                    28
"'as',",                       28
"'killing',",                  27
"'two',",     

In [10]:
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from gensim.models import Word2Vec


sentences = [nltk.word_tokenize(text) for text in df['mental_health_details']]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

def average_word_vectors(words, model, key_to_index, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    n_words = 0
    for word in words:
        if word in key_to_index:
            n_words += 1
            feature_vector = np.add(feature_vector, model.wv.__getitem__(word))
    if n_words > 0:
        feature_vector = np.divide(feature_vector, n_words)
    return feature_vector

def sentiment_analysis(text):
    sid = SentimentIntensityAnalyzer()
    words = nltk.word_tokenize(text)
    word_vectors = average_word_vectors(words, word2vec_model, word2vec_model.wv.key_to_index, 100)
    sentiment_score = sid.polarity_scores(" ".join(words))['compound']
    if sentiment_score >= 0.05:
        return 'Positive'
    elif sentiment_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

df['sentiment'] = df['mental_health_details'].apply(sentiment_analysis)

In [11]:
print(df[['case','sentiment']])

                                              case sentiment
0              Nashville religious school shooting   Neutral
1               Michigan State University shooting   Neutral
2                     Half Moon Bay spree shooting   Neutral
3                    LA dance studio mass shooting  Negative
4                        Virginia Walmart shooting   Neutral
5                              LGBTQ club shooting  Negative
6                  University of Virginia shooting   Neutral
7                           Raleigh spree shooting   Neutral
8                     Greenwood Park Mall shooting   Neutral
9             Highland Park July 4 parade shooting   Neutral
10                  Church potluck dinner shooting   Neutral
11                       Concrete company shooting   Neutral
12                   Tulsa medical center shooting   Neutral
13                 Robb Elementary School massacre   Neutral
14                    Buffalo supermarket massacre  Negative
15               Sacrame

In [7]:
row_index = 66
print(df.iloc[row_index])

case                                                Umpqua Community College shooting
location                                                             Roseburg, Oregon
date                                                                        10/1/2015
summary                             [26-year-old, chris, harper, mercer, opened, f...
fatalities                                                                          9
injured                                                                             9
total_victims                                                                      18
location.1                                                                     School
age_of_shooter                                                                     26
prior_signs_mental_health_issues                                              Unclear
mental_health_details               Harper-Mercer's mother said in multiple online...
weapons_obtained_legally                              