# IS 362 - Project 2: Powerball Data
## Dataset 1

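Source: modeled on the NY Lottery Powerball Winning Numbers dataset. Note that the notebook below generates random sample data in the same wide layout rather than loading the published file, so the results illustrate the workflow, not real drawing history.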

## Loading the Data

In [None]:
import pandas as pd
import numpy as np

# Create sample Powerball data
# This is in wide format - each row is a drawing, columns are the 5 numbers + powerball
#
# A seeded generator makes the sample identical on every Restart & Run All,
# so the printed tables stay in sync with the written findings.
SEED = 42
N_DRAWINGS = 250
rng = np.random.default_rng(SEED)

# The five main numbers of a drawing are distinct, so each drawing is sampled
# without replacement from 1-69. Drawing every column independently could
# repeat a number inside a single row.
main_numbers = np.array([
    rng.choice(np.arange(1, 70), size=5, replace=False)
    for _ in range(N_DRAWINGS)
])

powerball_data = {
    'Date': pd.date_range('2010-01-06', periods=N_DRAWINGS, freq='3D'),
    'Num_1': main_numbers[:, 0],
    'Num_2': main_numbers[:, 1],
    'Num_3': main_numbers[:, 2],
    'Num_4': main_numbers[:, 3],
    'Num_5': main_numbers[:, 4],
    # The PowerBall comes from its own pool of 1-26, so it may match a main
    # number. The upper bound of integers() is exclusive.
    'PowerBall': rng.integers(1, 27, size=N_DRAWINGS),
}

df = pd.DataFrame(powerball_data)
print("Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

## Cleaning the Data

The numbers are spread across 5 columns. I need to convert this to long format so each number is a separate row.

In [None]:
# Build the cleaned frame without touching the raw one: `assign` returns a
# new DataFrame, re-parsing Date as datetime and deriving Year from it.
df_clean = df.assign(
    Date=lambda frame: pd.to_datetime(frame['Date']),
    Year=lambda frame: frame['Date'].dt.year,
)

# Report the observed ranges so they can be compared with the valid ones
main_cols = ['Num_1', 'Num_2', 'Num_3', 'Num_4', 'Num_5']
main_block = df_clean[main_cols]
powerball_col = df_clean['PowerBall']

print("Min main number:", main_block.min().min())
print("Max main number:", main_block.max().max())
print("PowerBall range:", powerball_col.min(), "-", powerball_col.max())

# Total count of missing cells across the whole frame
null_total = df_clean.isnull().sum().sum()
print("\nNo null values:", null_total == 0)

## Converting to Long Format

Use `melt()` to collapse the five number columns into a single `Number` column; a new `Position` column records which original column each value came from.

In [None]:
# Reshape wide -> long: one row per (drawing, position) pair
number_cols = [f'Num_{position}' for position in range(1, 6)]

# Date, Year and PowerBall identify the drawing and are repeated on every
# row; the five number columns are stacked into Position/Number.
df_long = df_clean.melt(
    id_vars=['Date', 'Year', 'PowerBall'],
    value_vars=number_cols,
    var_name='Position',
    value_name='Number',
)

print("New shape:", df_long.shape)
print("\nFirst 15 rows:")
print(df_long.iloc[:15])

## Analysis

In [None]:
# Frequency of each main number across all five positions (long format)
main_counts = df_long['Number'].value_counts()
# PowerBall frequencies come from the wide frame, where each drawing is one
# row, so every PowerBall is counted once per drawing.
powerball_counts = df_clean['PowerBall'].value_counts()

print("Top 10 most common numbers:")
print(main_counts.iloc[:10])

print("\n" + "-"*40)
print("\nTop PowerBall numbers:")
print(powerball_counts.iloc[:10])

In [None]:
# Summary statistics, formatted up front and printed from a mapping
numbers = df_long['Number']
powerballs = df_clean['PowerBall']

main_stats = {
    "Mean": f"{numbers.mean():.1f}",
    "Median": f"{numbers.median():.1f}",
    "Min": f"{numbers.min()}",
    "Max": f"{numbers.max()}",
}
powerball_stats = {
    "Mean": f"{powerballs.mean():.1f}",
    "Median": f"{powerballs.median():.1f}",
}

print("Statistics for main numbers:")
for label, text in main_stats.items():
    print(f"{label}: {text}")

print("\nPowerBall stats:")
for label, text in powerball_stats.items():
    print(f"{label}: {text}")

In [None]:
# How many drawings per year?
# df_long holds five rows per drawing (one per main number), so counting its
# rows would report five times the number of drawings. Counting distinct
# drawing dates gives the real figure (each drawing has its own date here).
drawings_per_year = df_long.groupby('Year')['Date'].nunique()

print("Drawings per year:")
print(drawings_per_year)

## Summary

**Data transformations:**
- Converted from wide format (5 number columns) to long format (1 number column)
- Added year column from the date
- Checked that all numbers fall within the valid ranges by inspecting the printed minimums and maximums

**Key findings:**
- Some numbers appear more frequently than others
- PowerBall numbers also have variation
- No missing data
- Data is clean and ready for analysis

**Notes:**
- Powerball drawings are random, so higher frequency doesn't mean better chances
- Main numbers range from 1-69, PowerBall from 1-26