In [None]:
#Importing packages

import numpy as np
import pandas as pd
import nltk
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import re
import gensim
from sklearn.model_selection import train_test_split
import geopandas as gpd
import plotly.express as px
import folium
import matplotlib
!pip install mapclassify
import mapclassify
!pip install geodatasets
from geodatasets import get_path
import plotly.express as px
import matplotlib.pyplot as plt

In [None]:
# Load the UFO sightings CSV into a DataFrame. Lines the parser flags as bad
# are skipped (on_bad_lines='skip') instead of aborting the read.
data = pd.read_csv(
    '/content/ufo_data.csv',
    on_bad_lines='skip',
)
# Show the first rows as the cell output.
data.head()

## Preliminary Analysis

In [None]:
#Preliminary Analysis

# Column labels of the loaded frame, kept in `column_names` and printed.
column_names = data.columns
print("Column Names:", column_names)

# Store every comment as a string so the text steps can treat the column
# uniformly. Missing comments are replaced with '' first: astype(str) on its
# own would turn NaN into the literal text 'nan', which would then be shown
# and scored as if a witness had written it.
data['comments'] = data['comments'].fillna('').astype(str)

# Per-column dtypes, reported after the conversion above.
column_types = data.dtypes
print("Column Types:\n", column_types)

In [None]:
# World map with one marker per sighting, positioned from the latitude and
# longitude columns; hovering a marker shows that sighting's comment.
# NOTE(review): latitude/longitude are only coerced to numeric in a later
# cell - confirm the raw columns plot as intended at this point.
fig = px.scatter_geo(
    data,
    title='UFO Sightings Worldwide',
    hover_name='comments',
    lon='longitude',
    lat='latitude',
).update_layout(showlegend=False)  # the legend is switched off for this figure

fig.show()

In [None]:
#Creating a plot of the number of UFO sightings by year
# Parse the timestamp column; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
data['datetime'] = pd.to_datetime(data['datetime'], errors='coerce')

# Drop rows where datetime could not be parsed. The explicit copy() makes the
# result an independent frame, so the 'year' assignment below writes to it
# directly instead of risking pandas' SettingWithCopyWarning.
data = data.dropna(subset=['datetime']).copy()

# Aggregate data by year to count sightings per year
data['year'] = data['datetime'].dt.year
ufo_sightings_by_year = data.groupby('year').size()

# Line chart: one point per year, sighting count on the y axis.
plt.figure(figsize=(12, 6))
plt.plot(ufo_sightings_by_year.index, ufo_sightings_by_year.values, marker='o')
plt.title('UFO Sightings by Year')
plt.xlabel('Year')
plt.ylabel('Number of Sightings')
plt.grid(True)
plt.show()

## Performing Sentiment Analysis using VADER

In [None]:
# Fetch the NLTK data packages requested by this notebook: the VADER lexicon
# and the stopword lists.
for nltk_resource in ('vader_lexicon', 'stopwords'):
    nltk.download(nltk_resource)


def preprocess_text(text):
    """Normalise a raw comment into lowercase, single-spaced text.

    The input is converted with str(), so non-string values are accepted.
    Leading/trailing spaces produced by the substitutions are not stripped.

    Args:
        text: The value to clean (any object; str() is applied).

    Returns:
        The cleaned string.
    """
    # Every non-word character (punctuation, symbols, whitespace) becomes a
    # single space, then the result is lowercased.
    spaced = re.sub(r'\W', ' ', str(text)).lower()
    # Runs of whitespace collapse to one space.
    collapsed = re.sub(r'\s+', ' ', spaced, flags=re.I)
    # A lone leading "b" followed by whitespace is removed.
    return re.sub(r'^b\s+', '', collapsed)


# Create the VADER analyzer once; get_sentiment_score() reads it through the
# module-level name `sia`.
sia = SentimentIntensityAnalyzer()

# Cleaned version of every comment, stored next to the raw text.
cleaned_comments = data['comments'].apply(preprocess_text)
data['processed_comments'] = cleaned_comments

# Function to get the compound score
def get_sentiment_score(text):
    """Return VADER's compound polarity score for `text`.

    The text is scored with the module-level `sia` analyzer; only the
    'compound' entry of the resulting score dict is returned.
    """
    return sia.polarity_scores(text)['compound']

# Score each cleaned comment with VADER and keep the compound score per row.
compound_scores = data['processed_comments'].apply(get_sentiment_score)
data['VADER_sentiment_score'] = compound_scores

# Preview the frame with the new columns.
data.head()

In [None]:
# Coerce both coordinate columns to numbers; entries that are not valid
# numbers become NaN (errors='coerce').
for coordinate in ('latitude', 'longitude'):
    data[coordinate] = pd.to_numeric(data[coordinate], errors='coerce')

# Remove, in place, every sighting left without a latitude or a longitude.
data.dropna(subset=['latitude', 'longitude'], inplace=True)

def sentiment_category(score, threshold=0.05):
    """Turn a VADER compound score into a sentiment label.

    A compound score of 0 means VADER found no sentiment in the text, so it
    must not be counted as positive. The bands follow the thresholds
    recommended by the VADER authors: scores at or above +threshold are
    positive, scores at or below -threshold are negative, everything in
    between is neutral.

    Args:
        score: Compound score, expected in [-1, 1].
        threshold: Half-width of the neutral band (default 0.05). Passing 0
            gives a plain binary split (score >= 0 positive, otherwise
            negative).

    Returns:
        'positive', 'negative' or 'neutral'. A NaN score fails both
        comparisons and is therefore reported as 'neutral' rather than
        silently yielding None.
    """
    if score >= threshold:
        return 'positive'
    if score <= -threshold:
        return 'negative'
    return 'neutral'


# Label every sighting with the category derived from its VADER compound score.
vader_categories = data['VADER_sentiment_score'].apply(sentiment_category)
data['VADER_sentiment_category'] = vader_categories

# World map of the sightings coloured by VADER category. Hovering a marker
# shows the comment together with its compound score.
fig3 = px.scatter_geo(
    data,
    title='UFO Sightings Sentiment Worldwide',
    lat='latitude',
    lon='longitude',
    projection='natural earth',
    opacity=.5,
    color='VADER_sentiment_category',
    color_discrete_map={'positive': 'green', 'negative': 'red'},  # fixed colours for these two labels
    hover_name='comments',
    hover_data=['VADER_sentiment_score'],
).update_layout(showlegend=True)

fig3.show()

## Performing Sentiment Analysis with an RNN Model Trained on the Sentiment140 Dataset

In [17]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
import matplotlib.pyplot as plt

# Load the Sentiment140 tweets. The file is read without a header row, so the
# six column names are attached afterwards.
# NOTE(review): row 610683 is skipped - presumably a malformed line in this
# copy of the file; confirm.
sentiment140_data = pd.read_csv(
    '/content/Sentiment140.csv',
    encoding='ISO-8859-1',
    header=None,
    skiprows=[610683],
)
sentiment140_data.columns = ['polarity', 'tweet_id', 'date', 'query', 'username', 'text']

# Re-code the polarity values 0 / 2 / 4 as consecutive class ids 0 / 1 / 2.
# A polarity value that is not a key of the mapping becomes NaN.
sentiment_mapping = {0: 0, 2: 1, 4: 2}
sentiment140_data['polarity'] = sentiment140_data['polarity'].map(sentiment_mapping)
labels = sentiment140_data['polarity'].values

# Fit the tokenizer on the tweets (num_words=10000), convert each tweet to a
# sequence of word ids and pad the sequences to a common length of `maxlen`.
maxlen = 100
tokenizer = Tokenizer(num_words=10000)
tweet_texts = sentiment140_data['text'].values
tokenizer.fit_on_texts(tweet_texts)
sequences = tokenizer.texts_to_sequences(tweet_texts)
padded_sequences = pad_sequences(sequences, maxlen=maxlen)

# Hold out 20% of the tweets for testing; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each split.
for split_name, split_array in (
    ("x_train", x_train),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape: {split_array.shape}")


# Hyperparameters for the LSTM sentiment classifier and its training run.
embedding_dim = 32
rnn_units = 64
num_classes = 3
batch_size = 128
epochs = 5

# Architecture: word-id embedding -> single LSTM layer (final state only)
# -> dropout -> softmax over the sentiment classes.
# input_dim=10000 mirrors the tokenizer's num_words setting.
lstm_stack = [
    Embedding(input_dim=10000, output_dim=embedding_dim, input_length=maxlen),
    LSTM(rnn_units, return_sequences=False),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
]
model = Sequential(lstm_stack)

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train; 20% of the training split is set aside for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    verbose=1,
)

# Score the trained model on the held-out test split.
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Accuracy: {test_acc:.2f}')




In [None]:
# Apply the trained model to the UFO comments. The comments go through the
# tokenizer fitted on the tweets and are padded to the same `maxlen`, so the
# model receives input in the format it was trained on.
sentiment_labels = {0: "negative", 1: "neutral", 2: "positive"}

comments = data['comments'].astype(str)

# Comments -> word-id sequences -> fixed-length padded arrays.
ufo_sequences = tokenizer.texts_to_sequences(comments)
ufo_padded_sequences = pad_sequences(ufo_sequences, maxlen=maxlen)

# Class probabilities per comment; the highest-probability class is the
# predicted label.
model_predictions = model.predict(ufo_padded_sequences)
predicted_labels = np.argmax(model_predictions, axis=1)

# Translate the numeric class ids into readable sentiment names, row by row.
data['RNN_predicted_sentiment'] = list(map(sentiment_labels.__getitem__, predicted_labels))

# Preview the frame with the new column.
data.head()


In [None]:
# World map of the sightings coloured by the model's predicted sentiment
# (the RNN_predicted_sentiment column); hovering a marker shows the comment.
fig5 = px.scatter_geo(
    data,
    title='UFO Sightings Sentiment Worldwide',
    lat='latitude',
    lon='longitude',
    projection='natural earth',
    opacity=.5,
    color='RNN_predicted_sentiment',
    color_discrete_map={'positive': 'green', 'negative': 'red'},  # fixed colours for these two labels
    hover_name='comments',
).update_layout(showlegend=True)

fig5.show()

In [None]:
# How many sightings each method assigned to each sentiment label.
vader_label_counts = data['VADER_sentiment_category'].value_counts()
rnn_label_counts = data['RNN_predicted_sentiment'].value_counts()

print(f"Number of sentiment entries for VADER:{vader_label_counts}")
print()

print(f"Number of sentiment entries for RNN:{rnn_label_counts}")
print()

In [None]:
# Count the sightings for which VADER and the RNN produced the same label.
labels_agree = data['VADER_sentiment_category'] == data['RNN_predicted_sentiment']
vader_rnn_matches = labels_agree.sum()

print(f"Number of matching sentiment entries: {vader_rnn_matches}")


In [None]:
# Graphing sentiment over time for the RNN and for VADER

def _plot_yearly_sentiment(frame, sentiment_column, title):
    """Plot the yearly number of sightings per sentiment label.

    Args:
        frame: DataFrame indexed by sighting datetime (required by resample).
        sentiment_column: Name of the column holding the sentiment labels.
        title: Title of the chart.

    Returns:
        DataFrame of counts with one row per year and one column per label.
    """
    # Bucket rows by year, count each label inside the bucket, then pivot the
    # labels into columns so every label becomes its own line.
    # NOTE(review): newer pandas releases deprecate the 'A' alias in favour of
    # 'YE'; switch once the minimum supported pandas version allows it.
    yearly_counts = frame.resample('A')[sentiment_column].value_counts().unstack()

    yearly_counts.plot(kind='line', figsize=(10, 5))
    plt.title(title)
    plt.ylabel('Number of Encounters')
    plt.xlabel('Year')
    plt.legend(title='Sentiment')
    plt.grid(True)
    plt.show()
    return yearly_counts


# Index the frame by sighting time so it can be resampled. The guard keeps the
# cell re-runnable: once 'datetime' has been moved into the index it is no
# longer a column, and an unconditional set_index would raise KeyError.
if 'datetime' in data.columns:
    data.set_index('datetime', inplace=True)

rnn_sentiment_counts = _plot_yearly_sentiment(
    data,
    'RNN_predicted_sentiment',
    'Positive and Negative RNN Sentiment Encounters Per Year',
)

vader_sentiment_counts = _plot_yearly_sentiment(
    data,
    'VADER_sentiment_category',
    'Positive and Negative VADER Sentiment Encounters Per Year',
)