## generate_data.py

In [1]:
import csv
import random
from datetime import datetime, timedelta
import json

# Generate data for 50 characters
NUM_ROWS = 1000

# Create the CSV file
OUTPUT_FILE = "troop_movements.csv"


def choose_a_side(home_world):
    """
    Randomly choose an empire or resistance side based on the likelihood 
    that someone from that world would join the rebel alliance.
    Args:
        home_world (dict): The home world data for the character.
    Returns:
        str: The empire or resistance side.
    """
    if home_world["rebel_likelihood"] > random.random():
        return "resistance"
    else:
        return "empire"


# Load home world data from JSON file
with open("home_worlds.json") as json_file:
    home_worlds = json.load(json_file)

# Generate data rows
data_rows = []
for i in range(1, NUM_ROWS + 1):
    # Generate random values for each column
    timestamp = datetime.now() - timedelta(seconds=i)
    unit_id = i
    unit_type = random.choice(
        ["stormtrooper", "tie_fighter", "at-st", "x-wing",
            "resistance_soldier", "at-at", "tie_silencer", "unknown"]
    )
    location_x = random.randint(1, 10)
    location_y = random.randint(1, 10)
    destination_x = random.randint(1, 10)
    destination_y = random.randint(1, 10)

    # Select a random home world from the available options
    home_world = random.choice(home_worlds)
    home_world_name = home_world["name"]
    empire_or_resistance = choose_a_side(home_world)

    # Create the data row
    data_row = [
        timestamp.strftime("%Y-%m-%d %H:%M:%S"),
        unit_id,
        unit_type,
        empire_or_resistance,
        location_x,
        location_y,
        destination_x,
        destination_y,
        home_world_name,
    ]

    # Add the data row to the list
    data_rows.append(data_row)

# Write the data to the CSV file
with open(OUTPUT_FILE, "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(
        ["timestamp", "unit_id", "unit_type", "empire_or_resistance", "location_x", "location_y", "destination_x",
         "destination_y", "homeworld"]
    )
    writer.writerows(data_rows)

print("Data generation complete.")


Data generation complete.


### Display troop_movements.csv as DataFrame

In [2]:
import pandas as pd
df = pd.read_csv("troop_movements.csv")
df
    

Unnamed: 0,timestamp,unit_id,unit_type,empire_or_resistance,location_x,location_y,destination_x,destination_y,homeworld
0,2024-02-07 08:33:35,1,tie_fighter,empire,5,10,5,4,Serenno
1,2024-02-07 08:33:34,2,stormtrooper,empire,2,2,2,1,Corellia
2,2024-02-07 08:33:33,3,tie_fighter,resistance,6,8,2,7,Shili
3,2024-02-07 08:33:32,4,at-st,empire,6,1,1,6,Iridonia
4,2024-02-07 08:33:31,5,resistance_soldier,resistance,4,6,8,7,Rodia
...,...,...,...,...,...,...,...,...,...
995,2024-02-07 08:17:00,996,stormtrooper,resistance,2,7,2,7,Ryloth
996,2024-02-07 08:16:59,997,x-wing,empire,6,4,1,10,Mirial
997,2024-02-07 08:16:58,998,at-st,resistance,10,1,4,8,Iktotch
998,2024-02-07 08:16:57,999,tie_fighter,empire,6,2,9,3,Eriadu


### Part 2: Build a prediction model

In [19]:
# Create grouped data showing counts of empire vs resistance.
er = pd.DataFrame({"empire_or_resistance":[], "count":[]})
er = df.groupby("empire_or_resistance").size().reset_index(name="count")
print(homeworld_counts.head())

er

ValueError: Length mismatch: Expected axis has 1 elements, new values have 2 elements

In [4]:
# Create grouped data showing counts of characters by homeworld
homeworld_counts = df.groupby("homeworld").size().reset_index(name="count")
print(homeworld_counts.head())


     homeworld  count
0     Alderaan     23
1  Aleen Minor     30
2   Bestine IV     18
3        Cerea     21
4     Champala     28


In [5]:
# Created grouped data showing counts of characters by unit_type
# Create grouped data showing counts of characters by homeworld
homeworld_counts = df.groupby("unit_type").size().reset_index(name="count")
print(homeworld_counts.head())


            unit_type  count
0               at-at    142
1               at-st    137
2  resistance_soldier    135
3        stormtrooper    135
4         tie_fighter    129


In [6]:
# Engineer a new feature called is_resistance with a True or False value based on empire_or_resiatance
df['is_resistance'] = df["empire_or_resistance"].map(lambda x: x == "resistance")
df

Unnamed: 0,timestamp,unit_id,unit_type,empire_or_resistance,location_x,location_y,destination_x,destination_y,homeworld,is_resistance
0,2024-02-07 08:33:35,1,tie_fighter,empire,5,10,5,4,Serenno,False
1,2024-02-07 08:33:34,2,stormtrooper,empire,2,2,2,1,Corellia,False
2,2024-02-07 08:33:33,3,tie_fighter,resistance,6,8,2,7,Shili,True
3,2024-02-07 08:33:32,4,at-st,empire,6,1,1,6,Iridonia,False
4,2024-02-07 08:33:31,5,resistance_soldier,resistance,4,6,8,7,Rodia,True
...,...,...,...,...,...,...,...,...,...,...
995,2024-02-07 08:17:00,996,stormtrooper,resistance,2,7,2,7,Ryloth,True
996,2024-02-07 08:16:59,997,x-wing,empire,6,4,1,10,Mirial,False
997,2024-02-07 08:16:58,998,at-st,resistance,10,1,4,8,Iktotch,True
998,2024-02-07 08:16:57,999,tie_fighter,empire,6,2,9,3,Eriadu,False


In [18]:
# Create a bar plot using Seaborn showing Empire vs Resistance distribution
import seaborn as sns
import matplotlib.pyplot as plt

#sns.barplot(x = 'empire_or_resistance',y='count',data = df)
er.columns

Index(['count'], dtype='object')

In [8]:
# Create a prediction model using sklearn.tree.DecisionTreeClassifier that predicts if a character is joining either the Empire or the Resistance based on their homeworld and unit_type.


In [9]:
# Convert categorical features to numeric using pd.get_dummies.

In [10]:
# Create a bar plot that shows feature importance.
# Example code to get feature importance:
# Get feature importances
importances = model.feature_importances_

# Create a DataFrame to hold the feature importances
feature_importances = pd.DataFrame({'Feature': X_encoded.columns, 'Importance': importances})


NameError: name 'model' is not defined

In [None]:
# Save to model as a pickle file named trained_model.pkl

### Part 3: Use the trained model with “real” data

Load data from troop_movements10m.csv (see Guided Project zip for file). This file contains 10 million records to be predicted. 
This data must be cleaned up a bit before it can be used:

⦁	Some unit_type records have a value of invalid_unit. Replace that with unknown.

⦁	Some location_x and location_Y values are missing. Use the ffill method to fill.

⦁	Save the clean data into a Parquet file named troop_movements10m.parquet.

⦁	You need to install pyarrow and fastparquet to support saving to a Parquet file.

pip install pyarrow

pip install fastparquet

Load the pickled model and load the data from the Parquet file into a data frame. Run the data through the model.
