## generate_data.py

In [2]:
import csv
import random
from datetime import datetime, timedelta
import json
import pandas as pd

# Number of troop-movement rows to generate (one row per unit)
NUM_ROWS = 1000

# Name of the CSV file the generated rows are written to
OUTPUT_FILE = "troop_movements.csv"


def choose_a_side(home_world):
    """
    Pick a faction for a character by sampling against the home world's
    ``rebel_likelihood`` value.

    A single draw from ``random.random()`` (uniform on [0.0, 1.0)) is
    compared with the likelihood: the character joins the resistance when
    the likelihood is strictly greater than the draw, otherwise the empire.

    Args:
        home_world (dict): Home world record; must contain the key
            "rebel_likelihood" (presumably a probability between 0 and 1).
    Returns:
        str: Either "resistance" or "empire".
    """
    # Read the likelihood first so a missing key fails before any random
    # number is consumed.
    likelihood = home_world["rebel_likelihood"]
    draw = random.random()
    return "resistance" if likelihood > draw else "empire"


# Load home world data from JSON file.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale encoding (which differs on Windows).
with open("home_worlds.json", encoding="utf-8") as json_file:
    home_worlds = json.load(json_file)

# Generate data rows

# Unit types a row can be assigned; built once rather than on every iteration.
UNIT_TYPES = [
    "stormtrooper", "tie_fighter", "at-st", "x-wing",
    "resistance_soldier", "at-at", "tie_silencer", "unknown",
]

# Capture the reference time once so every row is offset from the same
# instant. Re-reading the clock per row lets it advance mid-loop, which can
# produce duplicated or skipped seconds between consecutive rows.
generation_time = datetime.now()

data_rows = []
for i in range(1, NUM_ROWS + 1):
    # Row i is stamped i seconds before the reference time
    timestamp = generation_time - timedelta(seconds=i)
    unit_id = i
    unit_type = random.choice(UNIT_TYPES)

    # Current position and destination on a 10 x 10 grid (both ends inclusive)
    location_x = random.randint(1, 10)
    location_y = random.randint(1, 10)
    destination_x = random.randint(1, 10)
    destination_y = random.randint(1, 10)

    # Select a random home world from the available options; the side is
    # then sampled from that world's rebel likelihood.
    home_world = random.choice(home_worlds)
    home_world_name = home_world["name"]
    empire_or_resistance = choose_a_side(home_world)

    # Create the data row (field order must match the CSV header)
    data_row = [
        timestamp.strftime("%Y-%m-%d %H:%M:%S"),
        unit_id,
        unit_type,
        empire_or_resistance,
        location_x,
        location_y,
        destination_x,
        destination_y,
        home_world_name,
    ]

    # Add the data row to the list
    data_rows.append(data_row)

# Write the data to the CSV file.
# newline="" lets the csv module control line endings itself. The explicit
# UTF-8 encoding makes the output identical on every platform and matches the
# default encoding pandas.read_csv uses when the file is read back.
with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    # Header row; column order mirrors the order of fields in each data row
    writer.writerow(
        ["timestamp", "unit_id", "unit_type", "empire_or_resistance", "location_x", "location_y", "destination_x",
         "destination_y", "homeworld"]
    )
    writer.writerows(data_rows)

print("Data generation complete.")


Data generation complete.


### Display troop_movements.csv as DataFrame

In [4]:
# Read the generated CSV back into a DataFrame. The bare name on the last
# line makes the notebook render the frame as the cell's output.
df = pd.read_csv(filepath_or_buffer="troop_movements.csv")
df
    

Unnamed: 0,timestamp,unit_id,unit_type,empire_or_resistance,location_x,location_y,destination_x,destination_y,homeworld
0,2024-02-06 16:23:24,1,stormtrooper,resistance,2,5,6,10,Sullust
1,2024-02-06 16:23:23,2,x-wing,empire,3,5,3,9,Trandosha
2,2024-02-06 16:23:22,3,resistance_soldier,resistance,5,9,1,4,Rodia
3,2024-02-06 16:23:21,4,unknown,resistance,2,9,9,10,Iridonia
4,2024-02-06 16:23:20,5,at-at,resistance,6,5,10,3,Chandrila
...,...,...,...,...,...,...,...,...,...
995,2024-02-06 16:06:49,996,stormtrooper,empire,9,6,4,6,Glee Anselm
996,2024-02-06 16:06:48,997,x-wing,empire,3,3,1,6,Aleen Minor
997,2024-02-06 16:06:47,998,x-wing,resistance,4,8,8,5,Rodia
998,2024-02-06 16:06:46,999,stormtrooper,resistance,1,4,8,7,Dagobah


### Part 2: Build a prediction model

In [None]:
# Create grouped data showing counts of empire vs resistance.

In [5]:
# Create grouped data showing counts of characters by homeworld

In [None]:
# Create grouped data showing counts of characters by unit_type

In [None]:
# Engineer a new feature called is_resistance with a True or False value based on empire_or_resistance

In [None]:
# Create a bar plot using Seaborn showing Empire vs Resistance distribution

In [None]:
# Create a prediction model using sklearn.tree.DecisionTreeClassifier that predicts if a character is joining either the Empire or the Resistance based on their homeworld and unit_type.


In [None]:
# Convert categorical features to numeric using pd.get_dummies.

In [None]:
# Create a bar plot that shows feature importance.
# Example code to get feature importance:
# NOTE: assumes `model` (the fitted classifier) and `X_encoded` (the encoded
# feature DataFrame) were defined in earlier cells — confirm before running.
# Get feature importances
importances = model.feature_importances_

# Pair each encoded column name with its importance score in a DataFrame
feature_importances = pd.DataFrame(
    dict(Feature=X_encoded.columns, Importance=importances)
)


In [None]:
# Save the model as a pickle file named trained_model.pkl

### Part 3: Use the trained model with “real” data

Load data from troop_movements10m.csv (see Guided Project zip for file). This file contains 10 million records to be predicted. 
This data must be cleaned up a bit before it can be used:

⦁	Some unit_type records have a value of invalid_unit. Replace that with unknown.

⦁	Some location_x and location_y values are missing. Use the ffill method to fill them.

⦁	Save the clean data into a Parquet file named troop_movements10m.parquet.

⦁	You need to install pyarrow and fastparquet to support saving to a Parquet file.

pip install pyarrow

pip install fastparquet

Load the pickled model and load the data from the Parquet file into a data frame. Run the data through the model.
