# PyEgen Complete Feature Demonstration and Testing Guide

In [None]:
# PyEgen Complete Feature Demonstration and Testing
# Python version of Stata egen command detailed usage guide

import pandas as pd
import numpy as np
import pyegen as egen
import warnings
# Suppress all warnings so the demonstration output stays uncluttered.
warnings.filterwarnings(action="ignore")

# Report the library versions in use; when pyegen exposes no __version__
# attribute the placeholder 'Installed' is shown instead.
print(f"PyEgen Version: {getattr(egen, '__version__', 'Installed')}")
print(f"Pandas Version: {pd.__version__}")
print(f"NumPy Version: {np.__version__}")
print("=" * 50)

# 1. Environment Setup and Data Preparation

In [None]:
# 1.1 PyEgen Environment Configuration

import pandas as pd
import numpy as np
import pyegen as egen
import warnings
# Suppress all warnings so the demonstration output stays uncluttered.
warnings.filterwarnings(action="ignore")

# Report the library versions in use; when pyegen exposes no __version__
# attribute the placeholder 'Installed' is shown instead.
print(f"PyEgen Version: {getattr(egen, '__version__', 'Installed')}")
print(f"Pandas Version: {pd.__version__}")
print(f"NumPy Version: {np.__version__}")
print("=" * 50)

In [None]:
# 1.2 Create Test Datasets
# Builds the three small in-memory frames (df_basic, df_missing, df_country)
# that the later cells copy and extend.
print("Creating Test Datasets", "=" * 30, sep="\n")

# Basic test data: three groups of three rows; value1 deliberately contains
# duplicate values so the ranking demos have ties to resolve.
df_basic = pd.DataFrame(
    {
        'group': list('AAABBBCCC'),
        'value1': [10, 20, 20, 30, 30, 30, 40, 50, 50],
        'value2': list(range(1, 10)),
        'value3': [hundreds * 100 for hundreds in range(1, 10)],
        'id': range(1, 10),
    }
)
print("Basic test data:", df_basic, "", sep="\n")

# Test data with missing values: NaNs are scattered across var1..var3.
df_missing = pd.DataFrame(
    {
        'var1': [1, 2, np.nan, 4, 5, np.nan, 7, 8, 9],
        'var2': [10, np.nan, 30, 40, np.nan, 60, 70, 80, np.nan],
        'var3': [100, 200, 300, np.nan, 500, 600, np.nan, 800, 900],
        'group': list('XXXYYYZZZ'),
    }
)
print("Test data with missing values:", df_missing, "", sep="\n")

# Country GDP data example: three countries observed over the same three years.
df_country = pd.DataFrame(
    {
        'country': ['USA'] * 3 + ['CHN'] * 3 + ['DEU'] * 3,
        'year': [2020, 2021, 2022] * 3,
        'gdp': [21.43, 22.32, 23.32, 14.72, 17.73, 17.89, 3.84, 4.26, 4.26],
        'population': [331, 332, 333, 1439, 1412, 1425, 83, 83, 83],
    }
)
print("Country GDP data:", df_country, sep="\n")

# 2. Basic Function Demonstration - rank() Function

In [None]:
# Demonstrates egen.rank() on the 'value1' column of a copy of df_basic,
# once with no method argument and once per explicit tie-handling method.
print(
    "rank() Function Demonstration",
    "=" * 30,
    "Stata equivalent command: egen newvar = rank(value1)",
    "",
    sep="\n",
)

# Basic ranking
df_rank = df_basic.copy()
# No method is passed here, so this column shows pyegen's default tie
# handling (expected to be 'average' — TODO confirm against the pyegen docs).
df_rank['rank_default'] = egen.rank(df_rank['value1'])
for tie_method in ('min', 'max', 'first'):
    df_rank[f'rank_{tie_method}'] = egen.rank(df_rank['value1'], method=tie_method)

print(
    "Comparison of different ranking methods:",
    df_rank[['value1', 'rank_default', 'rank_min', 'rank_max', 'rank_first']],
    "",
    sep="\n",
)

In [None]:
# Ranking by group: done with pandas itself (groupby + rank with default
# arguments); pyegen is not called in this cell.
value1_within_group = df_rank.groupby('group')['value1']
df_rank['rank_by_group'] = value1_within_group.rank()
print("Ranking by group:", df_rank[['group', 'value1', 'rank_by_group']], "", sep="\n")

In [None]:
# Handling tied values in ranking: a single column with a 2-way and a 3-way
# tie, ranked once per explicit tie-handling method.
df_ties = pd.DataFrame({'value': [10, 20, 20, 30, 30, 30, 40]})
for tie_method in ('average', 'min', 'max'):
    df_ties[f'rank_{tie_method}'] = egen.rank(df_ties['value'], method=tie_method)

print("Handling tied values in ranking:", df_ties, sep="\n")

# 3. Row Statistics Functions Demonstration

In [None]:
# 3. Row Statistics Functions Demonstration
# Applies each pyegen row-wise function to value1..value3 of a copy of
# df_basic and prints the inputs next to the newly added result column.
print("Row Statistics Functions Demonstration", "=" * 30, sep="\n")

# Using basic data for row statistics demonstration
df_row = df_basic.copy()
columns_to_analyze = ['value1', 'value2', 'value3']

# One entry per demo:
# (pyegen function name, heading text, Stata function name, result column)
row_stat_demos = (
    ('rowmean', 'Row Mean', 'rowmean', 'row_mean'),
    ('rowtotal', 'Row Sum', 'rowtotal', 'row_total'),
    ('rowmax', 'Row Maximum', 'rowmax', 'row_max'),
    ('rowmin', 'Row Minimum', 'rowmin', 'row_min'),
    ('rowcount', 'Count of Non-missing Values per Row', 'rownonmiss', 'row_count'),
    ('rowsd', 'Row Standard Deviation', 'rowsd', 'row_sd'),
)
for func_name, heading, stata_func, out_col in row_stat_demos:
    print(f"{func_name}() - {heading}")
    print(f"Stata equivalent command: egen newvar = {stata_func}(value1 value2 value3)")
    # Look the function up only when its turn comes, so earlier demos still
    # print even if a later function is unavailable.
    df_row[out_col] = getattr(egen, func_name)(df_row, columns_to_analyze)
    print(df_row[columns_to_analyze + [out_col]])
    print()

# 4. Missing Value Handling Demonstration

In [None]:
# 4. Missing Value Handling Demonstration
# Runs the row-wise functions over var1..var3 of a copy of df_missing so the
# effect of NaN inputs on each result can be read off the printed table.
print("Missing Value Handling Demonstration", "=" * 30, sep="\n")

df_missing_demo = df_missing.copy()
columns_missing = ['var1', 'var2', 'var3']

print("Original data with missing values:", df_missing_demo, "", sep="\n")

# Row statistics handling missing values
print("Row statistics handling missing values:")
# Result column -> pyegen row function that fills it (applied in this order).
missing_stat_funcs = {
    'mean_nonmissing': 'rowmean',
    'total_nonmissing': 'rowtotal',
    'count_nonmissing': 'rowcount',
    'max_nonmissing': 'rowmax',
    'min_nonmissing': 'rowmin',
}
for out_col, func_name in missing_stat_funcs.items():
    df_missing_demo[out_col] = getattr(egen, func_name)(df_missing_demo, columns_missing)

result_cols = list(missing_stat_funcs)
print(df_missing_demo[columns_missing + result_cols], "", sep="\n")

print(
    "Notes:",
    "- rowmean: Calculates mean of non-missing values",
    "- rowtotal: Calculates sum of non-missing values",
    "- rowcount: Counts non-missing values",
    "- rowmax/rowmin: Calculates maximum/minimum of non-missing values",
    "- Missing values are automatically ignored in calculations",
    sep="\n",
)

# 5. Grouping Functions Demonstration

In [None]:
# 5. Grouping Functions Demonstration
# Adds one column per pyegen group-wise function to a copy of df_basic,
# always aggregating 'value1' within 'group'.
print("Grouping Functions Demonstration", "=" * 30, sep="\n")

df_group = df_basic.copy()

# tag - mark first observation in each group
print("tag() - Mark First Observation in Each Group")
print("Stata equivalent command: egen newvar = tag(group)")
df_group['tag_group'] = egen.tag(df_group, ['group'])
print(df_group[['group', 'tag_group']], "", sep="\n")

# count / mean / sum - same call shape, so they share one loop
for agg_name, agg_title in (
    ('count', 'Count by Group'),
    ('mean', 'Group Mean'),
    ('sum', 'Group Sum'),
):
    print(f"{agg_name}() - {agg_title}")
    print(f"Stata equivalent command: egen newvar = {agg_name}(value1), by(group)")
    df_group[f'{agg_name}_by_group'] = getattr(egen, agg_name)(
        df_group['value1'], by=df_group['group']
    )
    print(df_group[['group', 'value1', f'{agg_name}_by_group']], "", sep="\n")

# max/min - group maximum/minimum, shown side by side
print("max()/min() - Group Maximum/Minimum")
print("Stata equivalent command: egen newvar = max(value1), by(group)")
for extreme in ('max', 'min'):
    df_group[f'{extreme}_by_group'] = getattr(egen, extreme)(
        df_group['value1'], by=df_group['group']
    )
print(df_group[['group', 'value1', 'max_by_group', 'min_by_group']], "", sep="\n")

# sd - group standard deviation
print("sd() - Group Standard Deviation")
print("Stata equivalent command: egen newvar = sd(value1), by(group)")
df_group['sd_by_group'] = egen.sd(df_group['value1'], by=df_group['group'])
print(df_group[['group', 'value1', 'sd_by_group']])

# 6. Advanced Functions Demonstration

In [None]:
# 6. Advanced Functions Demonstration
print("Advanced Functions Demonstration")
print("="*30)

# Work on a copy so the shared df_country frame stays untouched.
df_advanced = df_country.copy()

# seq - generate sequence numbers
print("seq() - Generate Sequence Numbers")
print("Stata equivalent command: egen newvar = seq()")
try:
    # Only the pyegen call is guarded: it is invoked with no arguments, and
    # whatever it returns must be assignable as a full column.
    df_advanced['sequence'] = egen.seq()
except Exception as e:
    # Best-effort demo: report why the argument-free call failed and fall
    # back to a plain 1..N row counter so the rest of the cell still runs.
    print(f"seq() call without arguments failed, using a 1..N fallback sequence: {e}")
    df_advanced['sequence'] = range(1, len(df_advanced) + 1)
print(df_advanced[['country', 'year', 'sequence']])
print()

# group - create group identifiers
# Tries egen.group() for a single key and then for a two-column key. Any
# exception raised inside the try (including from the second call, after the
# first result was already printed) switches to the pandas fallback below.
print("group() - Create Group Identifiers")
print("Stata equivalent command: egen newvar = group(country)")
try:
    df_advanced['country_group'] = egen.group(df_advanced, ['country'])
    print(df_advanced[['country', 'country_group']])
    print()

    # Multi-variable grouping
    print("Multi-variable Grouping:")
    df_advanced['country_year_group'] = egen.group(df_advanced, ['country', 'year'])
    print(df_advanced[['country', 'year', 'country_year_group']])
    print()
except Exception as e:
    print(f"group() function call error: {e}")
    # Use pandas factorize as alternative: its integer codes are shifted by
    # one so the ids start at 1. Only 'country_group' is rebuilt here; the
    # fallback does not create 'country_year_group'.
    # NOTE(review): factorize numbers groups in order of first appearance,
    # which may differ from the ordering egen.group() uses — confirm.
    df_advanced['country_group'] = pd.factorize(df_advanced['country'])[0] + 1
    print("Using alternative method to generate country grouping:")
    print(df_advanced[['country', 'country_group']])
    print()

# pc - calculate percentiles
# Calls egen.pc() on the whole gdp column and then once per year through
# groupby().transform(). Any exception inside the try switches to the pandas
# fallback, which only fills the overall 'gdp_percentile' column.
# NOTE(review): Stata documents egen pc() as "percent of total", whereas this
# cell labels it as percentiles and the fallback computes a percentile rank —
# confirm which of the two pyegen.pc() actually returns.
print("pc() - Calculate Percentiles")
print("Stata equivalent command: egen newvar = pc(gdp)")
try:
    df_advanced['gdp_percentile'] = egen.pc(df_advanced['gdp'])
    print(df_advanced[['country', 'gdp', 'gdp_percentile']])
    print()

    # Calculate percentiles by group: egen.pc() is applied to each year's gdp
    # values separately and the results are aligned back to the rows.
    print("Calculate GDP Percentiles by Year:")
    df_advanced['gdp_percentile_by_year'] = df_advanced.groupby('year')['gdp'].transform(lambda x: egen.pc(x))
    print(df_advanced[['country', 'year', 'gdp', 'gdp_percentile_by_year']])
    print()
except Exception as e:
    print(f"pc() function call error: {e}")
    # Use pandas rank as alternative: rank(pct=True) yields a fraction, so it
    # is scaled by 100.
    df_advanced['gdp_percentile'] = df_advanced['gdp'].rank(pct=True) * 100
    print("Using alternative method to calculate percentiles:")
    print(df_advanced[['country', 'gdp', 'gdp_percentile']])
    print()

# iqr - interquartile range
# Stores egen.iqr() of the gdp column, then a per-year version computed via
# groupby().transform(). Any exception inside the try switches to a pandas
# quantile-based fallback that only fills the overall 'gdp_iqr' column.
print("iqr() - Interquartile Range")
print("Stata equivalent command: egen newvar = iqr(gdp)")
try:
    df_advanced['gdp_iqr'] = egen.iqr(df_advanced['gdp'])
    # Only the first row's value is shown
    # (presumably the overall IQR is the same on every row — TODO confirm).
    print(f"Overall GDP Interquartile Range: {df_advanced['gdp_iqr'].iloc[0]:.2f}")

    # Calculate IQR by group; drop_duplicates() collapses the repeated
    # (year, value) pairs before printing.
    df_advanced['gdp_iqr_by_year'] = df_advanced.groupby('year')['gdp'].transform(lambda x: egen.iqr(x))
    print(df_advanced[['year', 'gdp_iqr_by_year']].drop_duplicates())
except Exception as e:
    print(f"iqr() function call error: {e}")
    # Use pandas IQR calculation as alternative: the two requested quantiles
    # are unpacked in the order they were asked for (0.75 first, then 0.25).
    q75, q25 = df_advanced['gdp'].quantile([0.75, 0.25])
    iqr_value = q75 - q25
    # The scalar is broadcast to every row of the new column.
    df_advanced['gdp_iqr'] = iqr_value
    print(f"Using alternative method to calculate GDP IQR: {iqr_value:.2f}")
    print(df_advanced[['country', 'gdp', 'gdp_iqr']].head(3))

# 7. Real-world Case Study - Student Grade Data Analysis

In [None]:
# 7. Real-world Case Study - Student Grade Data Analysis
# Generates a reproducible synthetic grade book for 100 students.
print("Real-world Case Study - Student Grade Data Analysis", "=" * 40, sep="\n")

# Create student grade data. The seed is fixed and the random draws happen in
# dictionary order (class, math, english, science, gender), so the generated
# values are reproducible from run to run.
np.random.seed(42)
n_students = 100
students = pd.DataFrame(
    {
        'student_id': range(1, n_students + 1),
        'class': np.random.choice(list('ABCD'), n_students),
        'math': np.random.normal(75, 15, n_students).round(1),
        'english': np.random.normal(78, 12, n_students).round(1),
        'science': np.random.normal(73, 18, n_students).round(1),
        'gender': np.random.choice(['M', 'F'], n_students),
    }
)

# Ensure grades are within reasonable range: normal draws can fall outside
# 0..100, so each subject is clamped to that interval.
for subject in ('math', 'english', 'science'):
    students[subject] = students[subject].clip(lower=0, upper=100)

print("Student grade data sample:", students.head(10), "", sep="\n")

# Calculate comprehensive statistics for each student: one pyegen row-wise
# function per result column, all computed across the three subject scores.
print("Individual Student Grade Statistics:")
subjects = ['math', 'english', 'science']

# (result column, pyegen row function) in the order the columns are added.
student_row_stats = (
    ('total_score', 'rowtotal'),
    ('avg_score', 'rowmean'),
    ('highest_score', 'rowmax'),
    ('lowest_score', 'rowmin'),
    ('score_std', 'rowsd'),
)
for out_col, func_name in student_row_stats:
    students[out_col] = getattr(egen, func_name)(students, subjects)

report_columns = ['student_id', *subjects, 'total_score', 'avg_score', 'highest_score', 'lowest_score', 'score_std']
print(students[report_columns].head(), "", sep="\n")

# Class-level statistics: attach each class's per-subject mean and its size
# to every student row, then reduce to one summary row per class.
print("Class Statistics:")
for subject in ('math', 'english', 'science'):
    students[f'class_avg_{subject}'] = egen.mean(students[subject], by=students['class'])
students['class_size'] = egen.count(students['student_id'], by=students['class'])

class_columns = ['class', 'class_avg_math', 'class_avg_english', 'class_avg_science', 'class_size']
# The selected values repeat for every student of a class, so dropping
# duplicate rows leaves the per-class summary.
per_class_rows = students[class_columns].drop_duplicates()
class_summary = per_class_rows.sort_values('class')
print(class_summary, "", sep="\n")

# Ranking analysis: overall ranks come from egen.rank(), the within-class
# rank from pandas groupby; all three use the 'min' tie method.
print("Ranking Analysis:")
math_scores = students['math']
students['math_rank_overall'] = egen.rank(math_scores, method='min')
math_within_class = students.groupby('class')['math']
students['math_rank_in_class'] = math_within_class.rank(method='min')
students['avg_rank_overall'] = egen.rank(students['avg_score'], method='min')

ranking_columns = ['student_id', 'class', 'math', 'math_rank_overall', 'math_rank_in_class']
print("Math grade ranking comparison:", students[ranking_columns].head(10), "", sep="\n")

# Gender analysis: attach each gender's per-subject mean to every student
# row, then keep one row per distinct combination for the summary.
print("Gender Difference Analysis:")
for subject in ('math', 'english', 'science'):
    students[f'{subject}_avg_by_gender'] = egen.mean(students[subject], by=students['gender'])

gender_columns = ['gender', 'math_avg_by_gender', 'english_avg_by_gender', 'science_avg_by_gender']
gender_summary = students[gender_columns].drop_duplicates()
print(gender_summary)

# 8. Stata to Python Conversion Reference Table

In [None]:
# 8. Stata to Python Conversion Reference Table
# Prints a two-column table pairing each Stata egen command with the PyEgen
# call used for it in this notebook, followed by a list of usage notes.
print("Stata to Python Conversion Reference Table", "=" * 40, sep="\n")

# (Stata command, PyEgen equivalent) pairs. The row-wise and by-group entries
# follow one template each, so they are generated rather than spelled out.
# NOTE(review): the seq() demo cell calls egen.seq() with no arguments while
# the row below shows egen.seq(len(df)) — confirm which matches pyegen.
stata_python_mapping = (
    [("egen newvar = rank(var)", "df['newvar'] = egen.rank(df['var'])")]
    + [
        (
            f"egen newvar = {stata_fn}(var1-var3)",
            f"df['newvar'] = egen.{py_fn}(df, ['var1', 'var2', 'var3'])",
        )
        for stata_fn, py_fn in (
            ('rowmean', 'rowmean'),
            ('rowtotal', 'rowtotal'),
            ('rowmax', 'rowmax'),
            ('rowmin', 'rowmin'),
            ('rownonmiss', 'rowcount'),
            ('rowsd', 'rowsd'),
        )
    ]
    + [("egen newvar = tag(group)", "df['newvar'] = egen.tag(df, ['group'])")]
    + [
        (
            f"egen newvar = {fn}(var), by(group)",
            f"df['newvar'] = egen.{fn}(df['var'], by=df['group'])",
        )
        for fn in ('count', 'mean', 'sum', 'max', 'min', 'sd')
    ]
    + [
        ("egen newvar = seq()", "df['newvar'] = egen.seq(len(df))"),
        ("egen newvar = group(var1 var2)", "df['newvar'] = egen.group(df, ['var1', 'var2'])"),
        ("egen newvar = pc(var)", "df['newvar'] = egen.pc(df['var'])"),
        ("egen newvar = iqr(var)", "df['newvar'] = egen.iqr(df['var'])"),
    ]
)

# The left column is padded to 35 characters so the separators line up.
print('Stata Command'.ljust(35) + " | " + 'PyEgen Equivalent')
print("-" * 80)
for stata, python in stata_python_mapping:
    print(stata.ljust(35) + " | " + python)

print(
    "",
    "Important Notes:",
    "1. PyEgen automatically handles missing values, consistent with Stata behavior",
    "2. All functions support pandas DataFrame and Series",
    "3. Grouping operations use the 'by' parameter, similar to Stata's by() option",
    "4. Ranking methods can be adjusted via the 'method' parameter",
    "5. Row statistics functions require providing a list of column names as the second parameter",
    sep="\n",
)

# 9. Performance Testing and Comprehensive Testing

In [None]:
# 9. Performance Testing and Comprehensive Testing
# Builds the 10,000-row random frame used by the timing cells that follow.
print("Performance Testing and Comprehensive Testing", "=" * 30, sep="\n")

import time

# Create large dataset for performance testing. The seed is fixed and the
# draws happen in column order (group first, then value1..value4), so the
# frame is reproducible from run to run.
print("Creating large dataset (10,000 rows) for performance testing...")
np.random.seed(123)
n_rows = 10_000
large_columns = {'group': np.random.choice(list('ABCDE'), n_rows)}
large_columns.update(
    {
        column: np.random.normal(center, spread, n_rows)
        for column, center, spread in (
            ('value1', 100, 20),
            ('value2', 50, 15),
            ('value3', 75, 25),
            ('value4', 60, 18),
        )
    }
)
large_df = pd.DataFrame(large_columns)

print(f"Dataset size: {large_df.shape}", "", sep="\n")

# Performance testing function
def time_function(func, *args, **kwargs):
    """Call ``func`` once with the given arguments and measure how long it takes.

    Args:
        func: Callable to run.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, elapsed_seconds)`` tuple: whatever ``func`` returned and
        the elapsed time of the call in seconds (a float).

    Any exception raised by ``func`` propagates unchanged.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short intervals; time.time() follows the wall clock and can
    # jump when the system time is adjusted.
    start_time = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed = time.perf_counter() - start_time
    return result, elapsed

# Test performance of various functions: each pyegen call is run once through
# time_function() and only the elapsed seconds (second tuple item) are kept.
print("Performance Test Results:")
test_columns = ['value1', 'value2', 'value3', 'value4']

# Row statistics function performance testing
time_rowmean = time_function(egen.rowmean, large_df, test_columns)[1]
time_rowtotal = time_function(egen.rowtotal, large_df, test_columns)[1]
time_rowmax = time_function(egen.rowmax, large_df, test_columns)[1]
time_rowcount = time_function(egen.rowcount, large_df, test_columns)[1]

print(
    f"rowmean (10,000 rows): {time_rowmean:.4f} seconds",
    f"rowtotal (10,000 rows): {time_rowtotal:.4f} seconds",
    f"rowmax (10,000 rows): {time_rowmax:.4f} seconds",
    f"rowcount (10,000 rows): {time_rowcount:.4f} seconds",
    "",
    sep="\n",
)

# Grouping function performance testing
time_group_mean = time_function(egen.mean, large_df['value1'], by=large_df['group'])[1]
time_group_count = time_function(egen.count, large_df['value1'], by=large_df['group'])[1]
time_tag = time_function(egen.tag, large_df, ['group'])[1]

print(
    f"Group mean (10,000 rows): {time_group_mean:.4f} seconds",
    f"Group count (10,000 rows): {time_group_count:.4f} seconds",
    f"Tag marking (10,000 rows): {time_tag:.4f} seconds",
    "",
    sep="\n",
)

# Comprehensive testing - simulate real data analysis workflow
# Runs a typical chain of pyegen calls on large_df and reports the total
# elapsed time for the whole chain.
print("Comprehensive Workflow Testing:")
# perf_counter() is used instead of time.time(): it is monotonic and
# high-resolution, so the measured interval cannot be distorted by
# wall-clock adjustments.
start_total = time.perf_counter()

# Step 1: Row statistics
large_df['avg_score'] = egen.rowmean(large_df, test_columns)
large_df['total_score'] = egen.rowtotal(large_df, test_columns)

# Step 2: Ranking
large_df['rank_avg'] = egen.rank(large_df['avg_score'])

# Step 3: Group statistics
large_df['group_avg'] = egen.mean(large_df['avg_score'], by=large_df['group'])
large_df['group_count'] = egen.count(large_df['avg_score'], by=large_df['group'])

# Step 4: Tagging and grouping
large_df['group_tag'] = egen.tag(large_df, ['group'])
large_df['group_id'] = egen.group(large_df, ['group'])

end_total = time.perf_counter()
print(f"Complete workflow processing 10,000 rows of data: {end_total - start_total:.4f} seconds")
print()

# Validate results accuracy: show a sample of the computed columns, then an
# independent pandas aggregation per group to compare against by eye.
print("Result Validation:", "First 5 rows result sample:", sep="\n")
result_cols = ['group', 'avg_score', 'rank_avg', 'group_avg', 'group_count', 'group_tag']
print(large_df[result_cols].head(5), "", sep="\n")

print("Group summary validation:")
# Per group: number and mean of avg_score, plus the sum of the tag column.
per_group = large_df.groupby('group')
group_summary = per_group.agg({'avg_score': ['count', 'mean'], 'group_tag': 'sum'})
group_summary = group_summary.round(2)
# Replace the two-level column index produced by agg() with flat labels.
group_summary.columns = ['Actual Count', 'Actual Mean', 'Tag Count']
print(group_summary)

# 10. Summary and Best Practices

In [None]:
# 10. Summary and Best Practices
# Pure text output: a recap of the function families shown above followed by
# usage tips. Each print() call below emits one section, one item per line.
print("PyEgen Summary and Best Practices", "=" * 35, sep="\n")

print("Main Feature Categories:", "", sep="\n")

print(
    "1. Row-wise Functions:",
    "   - rowmean()  - Row mean",
    "   - rowtotal() - Row sum",
    "   - rowmax()   - Row maximum",
    "   - rowmin()   - Row minimum",
    "   - rowcount() - Count of non-missing values per row",
    "   - rowsd()    - Row standard deviation",
    "",
    sep="\n",
)

print(
    "2. Group-wise Functions:",
    "   - count()    - Count (supports grouping)",
    "   - mean()     - Mean (supports grouping)",
    "   - sum()      - Sum (supports grouping)",
    "   - max()      - Maximum (supports grouping)",
    "   - min()      - Minimum (supports grouping)",
    "   - sd()       - Standard deviation (supports grouping)",
    "",
    sep="\n",
)

print(
    "3. Identification and Ranking Functions:",
    "   - tag()      - Mark first observation in each group",
    "   - rank()     - Ranking (multiple methods)",
    "   - group()    - Create group identifiers",
    "",
    sep="\n",
)

print(
    "4. Advanced Statistical Functions:",
    "   - seq()      - Generate sequence numbers",
    "   - pc()       - Percentiles",
    "   - iqr()      - Interquartile range",
    "",
    sep="\n",
)

print("Best Practice Recommendations:", "", sep="\n")
print(
    "Usage Tips:",
    "   1. All functions automatically handle missing values",
    "   2. Row functions require providing column name list: egen.rowmean(df, ['col1', 'col2'])",
    "   3. Grouping operations use 'by' parameter: egen.mean(df['value'], by=df['group'])",
    "   4. Ranking can specify method: egen.rank(series, method='min')",
    "   5. Can seamlessly integrate with pandas operations",
    "",
    sep="\n",
)

print(
    "Important Considerations:",
    "   1. Ensure input data types are correct (numeric data)",
    "   2. Consider memory usage for large datasets",
    "   3. Grouping variables should be appropriate data types",
    "   4. Row statistics functions require columns to exist in DataFrame",
    "",
    sep="\n",
)

print(
    "Advantages Over Native Pandas:",
    "   - Syntax closer to Stata, lower learning curve",
    "   - Unified API design",
    "   - Automatic missing value handling consistent with Stata",
    "   - Simplifies common data operations",
    "",
    sep="\n",
)

print(
    "PyEgen Feature Demonstration Complete!",
    "   This library provides researchers transitioning from Stata to Python",
    "   with familiar data manipulation interfaces, making data analysis more efficient.",
    sep="\n",
)

In [None]:
# END