In [1]:
!pip install pandas stanfordnlp gensim nltk numpy plotly

Collecting stanfordnlp
  Downloading stanfordnlp-0.2.0-py3-none-any.whl.metadata (8.6 kB)
Downloading stanfordnlp-0.2.0-py3-none-any.whl (158 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: stanfordnlp
Successfully installed stanfordnlp-0.2.0


In [11]:
import pandas as pd
from stanfordnlp.server import CoreNLPClient
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
import numpy as np
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import plotly.graph_objects as go

In [12]:
def clean_data_corrected(df):
    cols_to_clean = ['Model', 'ADAS/ADS System Version', 'State or Local Permit', 'Operating Entity', 'City', 'State',
                     'Roadway Type', 'Roadway Surface', 'Lighting', 'CP Pre-Crash Movement', 'SV Pre-Crash Movement', 'Narrative']
    for col in cols_to_clean:
        df[col] = df[col].str.strip().str.lower()
    df['Incident Date'] = pd.to_datetime(df['Incident Date'], errors='coerce')
    df[cols_to_clean] = df[cols_to_clean].fillna('unknown')
    df['Model Year'] = pd.to_numeric(df['Model Year'], errors='coerce').fillna(df['Model Year'].median(skipna=True))
    df['Posted Speed Limit (MPH)'] = pd.to_numeric(df['Posted Speed Limit (MPH)'], errors='coerce').fillna(df['Posted Speed Limit (MPH)'].median(skipna=True))
    return df

In [4]:
def process_text_with_corenlp(text, client):
    ann = client.annotate(text)
    return ' '.join([token.word for sentence in ann.sentence for token in sentence.token])


In [5]:
def process_narratives(narratives):
    client = CoreNLPClient(annotators=['tokenize','ssplit','pos','lemma','ner', 'parse', 'depparse'], timeout=30000, memory='16G')
    processed_texts = [process_text_with_corenlp(narrative, client) for narrative in narratives]
    client.stop()
    return processed_texts

In [6]:
def load_glove_model(glove_path):
    return KeyedVectors.load_word2vec_format(glove_path, binary=False)

In [13]:
def narrative_to_vec(narrative, glove_model):
    words = word_tokenize(narrative.lower())
    return np.mean([glove_model[word] for word in words if word in glove_model], axis=0)

def setup_similarity_index(categories, glove_model):
    dictionary = Dictionary([word_tokenize(desc.lower()) for desc in categories.values()])
    corpus = [dictionary.doc2bow(word_tokenize(desc.lower())) for desc in categories.values()]
    similarity_matrix = SparseTermSimilarityMatrix(glove_model, dictionary)
    return SoftCosineSimilarity(corpus, similarity_matrix), dictionary

def categorize_narratives(narrative_vectors, similarity_index, dictionary, categories):
    categorized_results = []
    for narrative_vec in narrative_vectors:
        query_bow = dictionary.doc2bow(word_tokenize(narrative_vec))
        similarities = similarity_index[query_bow]
        categorized_results.append(list(categories.keys())[similarities[0][0]])
    return categorized_results

In [14]:
def mainFunc(file_path):
    data = pd.read_csv(file_path)

    columns_to_extract = [
        'Model', 'Model Year', 'ADAS/ADS System Version', 'State or Local Permit', 'Operating Entity',
        'Incident Date', 'City', 'State', 'Roadway Type', 'Roadway Surface', 'Posted Speed Limit (MPH)',
        'Lighting', 'CP Pre-Crash Movement', 'SV Pre-Crash Movement', 'Narrative'
    ]

    filtered_data = data[columns_to_extract]
    filtered_data.loc[:, 'Narrative'] = filtered_data['Narrative'].fillna("unknown")


    categories_dict = {
        "Object/Obstacle avoidance": "crashes object obstacle pedestrians animals stopped vehicles tires",
        "Head-on collision": "crashes head-on vehicles lost control",
    }
    vectorizer = TfidfVectorizer()
    category_vectors = vectorizer.fit_transform(categories_dict.values())
    narrative_vector = vectorizer.transform(filtered_data['Narrative'])
    similarities = cosine_similarity(narrative_vector, category_vectors)
    filtered_data['Category'] = [list(categories_dict.keys())[index] for index in similarities.argmax(axis=1)]




    new_file_path = '/content/data/Categorized_Incident_Reports.csv'
    filtered_data.to_csv(new_file_path, index=False)


In [15]:
def create_visualization_plotly(x, y, data, title, xlabel, ylabel):
    if x == 'Category' and y == 'City':

        size = data.groupby([x, y]).size().reset_index(name='Count')
        fig = px.scatter(size, x=x, y=y, size='Count', title=title)
        fig.update_layout(xaxis_title=xlabel, yaxis_title=ylabel)
    elif x == 'Category' and y == 'Lighting':

        count_data = data.groupby(x)[y].value_counts().unstack(fill_value=0)
        fig = px.bar(count_data, barmode='stack', title=title)
        fig.update_layout(xaxis_title=xlabel, yaxis_title=ylabel)
    elif x == 'Posted Speed Limit (MPH)' and y == 'Roadway Surface':
        pivot_table = data.pivot_table(index=x, columns=y, aggfunc='size', fill_value=0)
        fig = go.Figure(data=go.Heatmap(
            z=pivot_table.values,
            x=pivot_table.columns,
            y=pivot_table.index,
            colorscale='YlGnBu'
        ))
        fig.update_layout(title=title, xaxis_title=xlabel, yaxis_title=ylabel)
    elif x == 'State' and y == 'Category':

        count_data = data.groupby(x)[y].value_counts().unstack(fill_value=0)
        fig = px.bar(count_data, barmode='stack', title=title)
        fig.update_layout(xaxis_title=xlabel, yaxis_title=ylabel)
    elif x == 'State' and y == 'Operating Entity':



        fig = px.histogram(data, x=x, color=y, title=title)
        fig.update_layout(xaxis_title=xlabel, yaxis_title=ylabel)
    elif x == 'Category' and y == 'Roadway Type':

        fig = px.histogram(data, x=x, color=y, title=title)
        fig.update_layout(xaxis_title=xlabel, yaxis_title=ylabel)

    fig.show()


In [16]:
file_path = '/content/data/SGO-2021-01_Incident_Reports_ADS.csv'
data = pd.read_csv(file_path)
cleaned_data = clean_data_corrected(data)


cleaned_file_path = '/content/data/Cleaned_ADS_Incident_Reports.csv'
cleaned_data.to_csv(cleaned_file_path, index=False)


mainFunc(cleaned_file_path)


data = pd.read_csv('/content/data/Categorized_Incident_Reports.csv')

fig = px.bar(data, x='Category', title='Frequency of Incident Categories')
fig.update_layout(xaxis_title='Category', yaxis_title='Count', xaxis={'categoryorder': 'total descending'})

fig.write_html("incident_categories_bar_chart.html")


roadway_counts = data['Roadway Surface'].value_counts()
fig = px.pie(values=roadway_counts, names=roadway_counts.index, title='Distribution of Roadway Surfaces')

fig.write_html("roadway_surfaces_pie_chart.html")


fig = px.histogram(data, x='Posted Speed Limit (MPH)', nbins=15, title='Histogram of Posted Speed Limits')
fig.update_layout(xaxis_title='Speed Limit (MPH)', yaxis_title='Frequency')
fig.write_html("speed_limits_histogram.html")
fig = px.box(data, x='Category', y='Model Year', title='Model Year Distribution by Category')
fig.update_layout(xaxis_title='Category', yaxis_title='Model Year')

fig.write_html("model_year_boxplot.html")

  df['Incident Date'] = pd.to_datetime(df['Incident Date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Category'] = [list(categories_dict.keys())[index] for index in similarities.argmax(axis=1)]
