## 01. Install and Import Required Packages

In [None]:
# !pip install pandas numpy scikit-learn nltk joblib

import pandas as pd
import numpy as np
import zipfile
import nltk
import joblib
import os
import re
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.utils.multiclass import unique_labels

## 02. Download NLTK Data

In [None]:
# Make sure each NLTK resource used by this notebook is available locally,
# downloading it only when the lookup fails.
for resource_path, package in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package)

from nltk.corpus import stopwords

# English stop words, kept in a set for O(1) membership checks.
STOPWORDS = set(stopwords.words('english'))

## 03. Extract Zip File

In [None]:
# Location of the uploaded archive and the directory it is unpacked into.
zip_path = '/content/archive.zip'
extract_to = '/content/data'

# Unpack every member of the archive into the target directory.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_to)

# Show what ended up at the top level of the extraction directory.
extracted_files = os.listdir(extract_to)
print("Extracted files and folders:", extracted_files)

# Some archives nest their files in an inner 'data' folder; report whether
# this one does.
data_path = os.path.join(extract_to, 'data')
if not os.path.exists(data_path):
    print("'data' folder not found inside the archive.")
else:
    print("Contents of 'data' folder:", os.listdir(data_path))

Extracted files and folders: ['complete.csv', 'scrubbed.csv']
'data' folder not found inside the archive.


## 04. Define State Abbreviations

In [None]:
# Maps a lowercase two-letter region code to its full display name.
STATE_FULL_NAMES = {
    # --- United States ---
    'al': 'Alabama',
    'ak': 'Alaska',
    'az': 'Arizona',
    'ar': 'Arkansas',
    'ca': 'California',
    'co': 'Colorado',
    'ct': 'Connecticut',
    'de': 'Delaware',
    'fl': 'Florida',
    'ga': 'Georgia',
    'hi': 'Hawaii',
    'id': 'Idaho',
    'il': 'Illinois',
    'in': 'Indiana',
    'ia': 'Iowa',
    'ks': 'Kansas',
    'ky': 'Kentucky',
    'la': 'Louisiana',
    'me': 'Maine',
    'md': 'Maryland',
    'ma': 'Massachusetts',
    'mi': 'Michigan',
    'mn': 'Minnesota',
    'ms': 'Mississippi',
    'mo': 'Missouri',
    'mt': 'Montana',
    'ne': 'Nebraska',
    'nv': 'Nevada',
    'nh': 'New Hampshire',
    'nj': 'New Jersey',
    'nm': 'New Mexico',
    'ny': 'New York',
    'nc': 'North Carolina',
    'nd': 'North Dakota',
    'oh': 'Ohio',
    'ok': 'Oklahoma',
    'or': 'Oregon',
    'pa': 'Pennsylvania',
    'ri': 'Rhode Island',
    'sc': 'South Carolina',
    'sd': 'South Dakota',
    'tn': 'Tennessee',
    'tx': 'Texas',
    'ut': 'Utah',
    'vt': 'Vermont',
    'va': 'Virginia',
    'wa': 'Washington',
    'wv': 'West Virginia',
    'wi': 'Wisconsin',
    'wy': 'Wyoming',
    'dc': 'District of Columbia',
    # --- Canada ---
    'ab': 'Alberta',
    'bc': 'British Columbia',
    'mb': 'Manitoba',
    'nb': 'New Brunswick',
    'nl': 'Newfoundland and Labrador',
    'ns': 'Nova Scotia',
    'nt': 'Northwest Territories',
    'nu': 'Nunavut',
    'on': 'Ontario',
    'pe': 'Prince Edward Island',
    'qc': 'Quebec',
    'sk': 'Saskatchewan',
    'yt': 'Yukon',
    # --- Alternate codes that map to the same provinces as above ---
    'nf': 'Newfoundland and Labrador',
    'pq': 'Quebec',
}

## 05. Define UFO Sighting Predictor Class

In [None]:
class UFOSightingPredictor:
    """Predict the state/province code of a UFO sighting.

    A RandomForest classifier is trained on five numeric features (year,
    month, hour, weekday, duration in seconds) concatenated with a TF-IDF
    representation of the cleaned sighting comments. The fitted classifier,
    vectorizer and label encoder are persisted together with joblib so later
    sessions can skip training.
    """

    # Compiled once. The flags must be given to the pattern (or as `flags=`):
    # the third positional argument of `re.sub` after the string is `count`.
    _NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]', flags=re.I | re.A)

    # Timestamp format accepted by `predict` for string input.
    _TIME_FORMAT = "%Y-%m-%d %H:%M"

    def __init__(self, model_path="ufo_sighting_model.joblib"):
        """Create a predictor and load a saved model from `model_path` if one exists.

        Args:
            model_path: Path of the joblib file used to load and save the model.
        """
        self.model_path = model_path
        self.model = None
        self.tfidf = None
        self.label_encoder = None
        self._load_model()

    def _is_ready(self):
        """Return True when the classifier, vectorizer and encoder are all set.

        Uses identity checks rather than truthiness: scikit-learn ensembles
        implement `__len__`, so `bool(model)` on an unfitted forest raises
        instead of returning a boolean.
        """
        parts = (self.model, self.tfidf, self.label_encoder)
        return all(part is not None for part in parts)

    def _clean_text(self, text):
        """Normalize free text for TF-IDF.

        Removes every character that is not an ASCII letter or whitespace,
        lowercases the result, drops tokens found in `STOPWORDS`, and joins
        the remaining tokens with single spaces.

        Args:
            text: Raw text. `None` yields an empty string; other non-string
                values are converted with `str()`.

        Returns:
            The cleaned, space-joined string.
        """
        if text is None:
            return ''
        letters_only = self._NON_LETTER_RE.sub('', str(text)).lower()
        return ' '.join(
            word for word in letters_only.split() if word not in STOPWORDS
        )

    def _load_and_preprocess_data(self, zip_path="archive.zip", data_file="data/scrubbed.csv"):
        """Read the sightings CSV and build the feature matrix and labels.

        Fits `self.label_encoder` on the `state` column and `self.tfidf` on
        the cleaned `comments` column as a side effect.

        Args:
            zip_path: Not used by this method; kept so existing callers that
                pass it keep working.
            data_file: Path of the CSV file to read.

        Returns:
            A tuple `(X, y)`. The columns of `X` are year, month, hour,
            weekday and duration in seconds, followed by the TF-IDF features;
            `y` holds the encoded state labels.
        """
        print("Loading and preprocessing data...")
        df = pd.read_csv(data_file, low_memory=False)
        df = df.dropna(subset=['datetime', 'state', 'duration (seconds)', 'comments'])

        # Values that cannot be parsed become NaT/NaN and are dropped below.
        df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
        df['duration (seconds)'] = pd.to_numeric(df['duration (seconds)'], errors='coerce')
        df = df.dropna(subset=['datetime', 'duration (seconds)'])

        df['comments'] = df['comments'].apply(self._clean_text)
        df['year'] = df['datetime'].dt.year
        df['month'] = df['datetime'].dt.month
        df['hour'] = df['datetime'].dt.hour
        df['weekday'] = df['datetime'].dt.weekday

        self.label_encoder = LabelEncoder()
        df['state_label'] = self.label_encoder.fit_transform(df['state'])

        self.tfidf = TfidfVectorizer(max_features=500)
        X_text = self.tfidf.fit_transform(df['comments']).toarray()
        # Column order here must match the order used in `predict`.
        X_time = df[['year', 'month', 'hour', 'weekday', 'duration (seconds)']].values
        X = np.concatenate([X_time, X_text], axis=1)
        y = df['state_label']
        return X, y

    def train(self):
        """Train the classifier, print an evaluation report and save the model.

        Holds out 20% of the data for the report and persists the fitted
        model via `_save_model`.
        """
        X, y = self._load_and_preprocess_data()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        print("Training RandomForest model...")
        self.model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        self.model.fit(X_train, y_train)

        print("\n=== Model Evaluation ===")
        y_pred = self.model.predict(X_test)
        # Report only on labels that occur in the test split or predictions,
        # so `target_names` lines up with `labels`.
        used_labels = unique_labels(y_test, y_pred)
        used_target_names = self.label_encoder.inverse_transform(used_labels)
        # zero_division=0 gives the same scores as the default but without a
        # warning for every label that has no true or predicted samples.
        print(classification_report(
            y_test,
            y_pred,
            labels=used_labels,
            target_names=used_target_names,
            zero_division=0,
        ))

        self._save_model()

    def _save_model(self):
        """Write the classifier, vectorizer and label encoder to `self.model_path`."""
        print(f"Saving model to {self.model_path}...")
        payload = {'model': self.model, 'tfidf': self.tfidf, 'label_encoder': self.label_encoder}
        joblib.dump(payload, self.model_path)
        print("Model saved successfully.")

    def _load_model(self):
        """Load a previously saved model from `self.model_path` if the file exists.

        When the file is missing, or its payload lacks one of the expected
        keys, the predictor is left untrained.
        """
        if not os.path.exists(self.model_path):
            print("No pre-trained model found.")
            return

        print(f"Loading pre-trained model from {self.model_path}...")
        # SECURITY: joblib.load unpickles the file, which can execute
        # arbitrary code. Only load model files from a trusted source.
        payload = joblib.load(self.model_path)
        try:
            model = payload['model']
            tfidf = payload['tfidf']
            label_encoder = payload['label_encoder']
        except (KeyError, TypeError):
            # Not a payload written by `_save_model`; assign nothing so the
            # instance is never left half-populated.
            print("No pre-trained model found.")
            return

        self.model = model
        self.tfidf = tfidf
        self.label_encoder = label_encoder
        print("Model loaded successfully.")

    def predict(self, description, sighting_time, duration_sec):
        """Predict and print the location for a single sighting.

        Args:
            description: Free-text description of the sighting.
            sighting_time: A 'YYYY-MM-DD HH:MM' string or a `datetime` instance.
            duration_sec: Duration in seconds; any value `float()` accepts
                that is finite.

        Returns:
            The predicted location name (the full name when the code is in
            `STATE_FULL_NAMES`, otherwise the upper-cased code), or `None`
            when the model is unavailable or an input is invalid.
        """
        if not self._is_ready():
            print("❌ Error: Model is not trained or loaded.")
            return None

        if isinstance(sighting_time, datetime):
            dt = sighting_time
        else:
            try:
                dt = datetime.strptime(str(sighting_time).strip(), self._TIME_FORMAT)
            except ValueError:
                print("❌ Invalid datetime format. Please use 'YYYY-MM-DD HH:MM'.")
                return None

        try:
            duration_sec = float(duration_sec)
        except (TypeError, ValueError):
            print("❌ Invalid duration. Please enter a number for seconds.")
            return None
        # float() accepts 'nan' and 'inf', which are not usable durations.
        if not np.isfinite(duration_sec):
            print("❌ Invalid duration. Please enter a number for seconds.")
            return None

        year, month, hour, weekday = dt.year, dt.month, dt.hour, dt.weekday()
        clean_text = self._clean_text(description)
        text_vec = self.tfidf.transform([clean_text]).toarray()
        # Same column order as the training matrix: numeric features first.
        time_features = np.array([year, month, hour, weekday, duration_sec]).reshape(1, -1)
        input_vec = np.concatenate([time_features, text_vec], axis=1)

        pred_label = self.model.predict(input_vec)
        pred_abbr = self.label_encoder.inverse_transform(pred_label)[0]
        pred_full_name = STATE_FULL_NAMES.get(pred_abbr.lower(), pred_abbr.upper())

        print("\n" + "="*40)
        print(f"🔮 Predicted Location: {pred_full_name}")
        print("="*40 + "\n")
        return pred_full_name

    def start_interactive_mode(self):
        """Prompt repeatedly for sighting details and print a prediction for each.

        The loop ends when the user answers 'quit' (case-insensitive,
        surrounding whitespace ignored) to any prompt, or when input reaches
        end-of-file.
        """
        if self.model is None:
            print("Please train the model first.")
            return

        print("\n=== 🛸 Live UFO Sighting Prediction ===")
        print("Enter sighting details to predict the location. Type 'quit' at any time to exit.")

        prompts = (
            "➡️ Enter sighting description: ",
            "➡️ Enter sighting time (YYYY-MM-DD HH:MM): ",
            "➡️ Enter duration in seconds (e.g., 180): ",
        )

        while True:
            answers = []
            for prompt in prompts:
                try:
                    answer = input(prompt)
                except EOFError:
                    # No more input available (e.g. piped stdin): stop cleanly.
                    return
                if answer.strip().lower() == 'quit':
                    return
                answers.append(answer)

            description, sighting_time, duration_sec = answers
            self.predict(description, sighting_time, duration_sec)


## 06. Run the Predictor

In [None]:
if __name__ == "__main__":
    # Reuse a saved model when one exists at this path; otherwise train one.
    predictor = UFOSightingPredictor(model_path="ufo_sighting_model.joblib")
    # Compare with None explicitly: scikit-learn ensembles implement
    # `__len__`, so a truthiness test on the model measures its number of
    # fitted estimators rather than whether a model is present.
    if predictor.model is None:
        print("Training a new model as no pre-trained model was found.")
        predictor.train()
    predictor.start_interactive_mode()