# Input/Output Operations - Solutions

Reading and writing data with read_csv(), to_csv(), read_excel(), JSON, Parquet, chunked reads, and dtype optimization.

## Question 1
Create a DataFrame and save it to CSV format, then read it back with specific data types.

In [None]:
import pandas as pd
import numpy as np
import os

# Build a small sample table: numeric ID/Score columns plus text Name/Grade columns
sample_columns = {
    'ID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Score': [85.5, 90.0, 78.5, 92.0, 88.5],
    'Grade': ['B', 'A', 'C', 'A', 'B'],
}
df_sample = pd.DataFrame(sample_columns)
print("Original DataFrame:", df_sample, df_sample.dtypes, sep="\n")

# Write the table to disk; index=False keeps the row index out of the file
csv_path = 'sample_data.csv'
df_sample.to_csv(csv_path, index=False)
print("\nSaved to CSV")

# Reload the file, pinning every column to an explicit dtype
# instead of relying on pandas' type inference
read_dtypes = {
    'ID': 'int32',
    'Name': 'string',
    'Score': 'float32',
    'Grade': 'category',
}
df_read = pd.read_csv(csv_path, dtype=read_dtypes)
print("\nRead back with specific dtypes:", df_read.dtypes, sep="\n")

## Question 2
Read a CSV file with custom parameters (separator, header, index_col, na_values).

In [None]:
# Create a custom CSV with a semicolon separator; 'N/A' and 'Missing'
# are placeholders for absent scores
custom_data = """ID;Name;Score;Status
1;Alice;85.5;Pass
2;Bob;N/A;Pass
3;Charlie;78.5;Fail
4;David;92.0;Pass
5;Eve;Missing;Pass"""

# Write with an explicit encoding so the file does not depend on the
# platform's locale default (read_csv below decodes as UTF-8)
with open('custom_data.csv', 'w', encoding='utf-8') as f:
    f.write(custom_data)

# Read with custom parameters
df_custom = pd.read_csv(
    'custom_data.csv',
    sep=';',                       # fields are semicolon-delimited, not comma-delimited
    header=0,                      # the first line holds the column names
    index_col='ID',                # use the ID column as the row index
    na_values=['N/A', 'Missing'],  # treat both placeholders as NaN
)
print("Custom CSV data:")
print(df_custom)

## Question 3
Save a DataFrame to Excel format with multiple sheets and read it back.

In [None]:
# Two small frames, one destined for each worksheet
df_sheet1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df_sheet2 = pd.DataFrame({'X': [7, 8, 9], 'Y': [10, 11, 12]})

excel_path = 'multi_sheet.xlsx'
sheets_to_write = {'Sheet1': df_sheet1, 'Sheet2': df_sheet2}

# One ExcelWriter is shared so both frames land in the same workbook
with pd.ExcelWriter(excel_path) as writer:
    for sheet_label, frame in sheets_to_write.items():
        frame.to_excel(writer, sheet_name=sheet_label, index=False)

print("Saved to Excel with multiple sheets")

# Load each worksheet individually by name
df_read_sheet1 = pd.read_excel(excel_path, sheet_name='Sheet1')
df_read_sheet2 = pd.read_excel(excel_path, sheet_name='Sheet2')

print("\nSheet1:", df_read_sheet1, sep="\n")
print("\nSheet2:", df_read_sheet2, sep="\n")

# sheet_name=None loads every worksheet at once; the result is keyed by sheet name
all_sheets = pd.read_excel(excel_path, sheet_name=None)
print("\nAll sheets:", list(all_sheets))

## Question 4
Convert a DataFrame to JSON format and read it back, exploring different orient options.

In [None]:
from io import StringIO

# Small frame used to show how each JSON orientation lays out the same data
df_json = pd.DataFrame({
    'Name': ['Alice', 'Bob'],
    'Age': [25, 30],
    'City': ['NYC', 'LA']
})

print("Original DataFrame:")
print(df_json)

# Different JSON orientations
orientations = ['records', 'index', 'values', 'columns']

for orient in orientations:
    # Serialize to a JSON string using the current orientation
    json_str = df_json.to_json(orient=orient)
    print(f"\nJSON with orient='{orient}':")
    print(json_str)

    # Read back. The text is wrapped in StringIO because passing a literal
    # JSON string straight to read_json is deprecated in recent pandas;
    # a file-like object works on old and new versions alike.
    df_back = pd.read_json(StringIO(json_str), orient=orient)
    print(f"Read back successfully: {df_back.shape}")

## Question 5
Save and read data in Parquet format, comparing file sizes with CSV.

In [None]:
# Seed NumPy's generator so the synthetic comparison data is reproducible
np.random.seed(42)
row_count = 1000
large_df = pd.DataFrame({
    'ID': range(row_count),
    'Value1': np.random.randn(row_count),
    'Value2': np.random.randn(row_count),
    'Category': np.random.choice(['A', 'B', 'C'], row_count),
    'Text': [f'Sample text {i}' for i in range(row_count)],
})

csv_file = 'large_data.csv'
parquet_file = 'large_data.parquet'

# Persist the same frame in both formats (row index omitted in each)
large_df.to_csv(csv_file, index=False)
large_df.to_parquet(parquet_file, index=False)

# Report the on-disk size of each file
print("File sizes:")
for label, path in (('CSV', csv_file), ('Parquet', parquet_file)):
    print(f"{label}: {os.path.getsize(path)} bytes")

# Load both files back and compare what was read
df_csv = pd.read_csv(csv_file)
df_parquet = pd.read_parquet(parquet_file)

print(f"\nCSV shape: {df_csv.shape}")
print(f"Parquet shape: {df_parquet.shape}")
frames_match = df_csv.equals(df_parquet)
print(f"DataFrames are equal: {frames_match}")

## Question 6
Read a large CSV file in chunks and process each chunk separately.

In [None]:
# Create a large CSV file
large_data = pd.DataFrame({
    'ID': range(10000),
    'Value': np.random.randn(10000)
})
large_data.to_csv('large_file.csv', index=False)

print("Created large CSV file with 10,000 rows")

# Read in chunks
chunk_size = 1000
max_chunks = 5  # only the first few chunks are processed for the demo
chunk_results = []

# The chunk reader is used as a context manager so its file handle is closed
# deterministically, even though the loop stops before the file is exhausted.
with pd.read_csv('large_file.csv', chunksize=chunk_size) as reader:
    for i, chunk in enumerate(reader):
        # Process each chunk (example: calculate mean)
        chunk_mean = chunk['Value'].mean()
        chunk_results.append(chunk_mean)
        print(f"Chunk {i+1}: Shape {chunk.shape}, Mean: {chunk_mean:.4f}")

        # Stop right after the last wanted chunk so no further chunk is parsed
        if i + 1 >= max_chunks:
            break

print(f"\nOverall mean of processed chunks: {np.mean(chunk_results):.4f}")

## Question 7
Optimize memory usage by specifying appropriate dtypes when reading data.

In [None]:
# Synthetic columns whose value ranges fit narrower dtypes than the defaults
sample_rows = 1000
optim_data = pd.DataFrame({
    'small_int': np.random.randint(0, 100, sample_rows),      # 0-99 fits in int8
    'large_int': np.random.randint(0, 1000000, sample_rows),  # up to 999,999 needs int32
    'float_val': np.random.randn(sample_rows),                # read back as float32 below
    'category': np.random.choice(['A', 'B', 'C'], sample_rows),  # three distinct labels
})
optimization_csv = 'optimization_test.csv'
optim_data.to_csv(optimization_csv, index=False)

# Baseline: let pandas infer every dtype
df_default = pd.read_csv(optimization_csv)
default_bytes = df_default.memory_usage(deep=True).sum()
print("Default dtypes:")
print(df_default.dtypes)
print(f"Memory usage: {default_bytes} bytes")

# Same file, with a narrower dtype requested for each column
compact_dtypes = {
    'small_int': 'int8',
    'large_int': 'int32',
    'float_val': 'float32',
    'category': 'category',
}
df_optimized = pd.read_csv(optimization_csv, dtype=compact_dtypes)
optimized_bytes = df_optimized.memory_usage(deep=True).sum()
print("\nOptimized dtypes:")
print(df_optimized.dtypes)
print(f"Memory usage: {optimized_bytes} bytes")

# Absolute and relative saving versus the default read
memory_saved = default_bytes - optimized_bytes
percent_saved = memory_saved / default_bytes * 100
print(f"\nMemory saved: {memory_saved} bytes ({percent_saved:.1f}%)")

## Question 8
Read only specific columns from a CSV file to save memory.

In [None]:
# NOTE: this cell reads 'large_data.csv', which must already exist on disk
source_csv = 'large_data.csv'

# Baseline: load every column
df_all = pd.read_csv(source_csv)
all_bytes = df_all.memory_usage(deep=True).sum()
print(f"All columns: {list(df_all.columns)}")
print(f"Shape: {df_all.shape}")
print(f"Memory usage: {all_bytes} bytes")

# usecols restricts parsing to the listed columns only
selected_columns = ['ID', 'Value1', 'Category']
df_selected = pd.read_csv(source_csv, usecols=selected_columns)
selected_bytes = df_selected.memory_usage(deep=True).sum()
print(f"\nSelected columns: {list(df_selected.columns)}")
print(f"Shape: {df_selected.shape}")
print(f"Memory usage: {selected_bytes} bytes")

# Difference between the full read and the column-restricted read
memory_saved = all_bytes - selected_bytes
print(f"\nMemory saved: {memory_saved} bytes")

## Question 9
Handle encoding issues when reading text files with special characters.

In [None]:
# Sample CSV text containing accented (non-ASCII) characters
special_text = "Name,Description\nCafé,Delicious café with résumé\nNaïve,Naïve approach to piñata"
special_path = 'special_chars.csv'

# Write the text to disk encoded as UTF-8
with open(special_path, 'w', encoding='utf-8') as out_file:
    out_file.write(special_text)

print("Created file with special characters")

# Attempt a strict ASCII decode and report whether it succeeded
try:
    df_ascii = pd.read_csv(special_path, encoding='ascii')
except UnicodeDecodeError:
    print("ASCII encoding failed - as expected")
else:
    print("ASCII encoding worked")

# Decode with the same encoding the file was written in
df_utf8 = pd.read_csv(special_path, encoding='utf-8')
print("\nUTF-8 encoding:", df_utf8, sep="\n")

# ASCII decode again, this time dropping undecodable bytes instead of raising
df_ignore = pd.read_csv(special_path, encoding='ascii', encoding_errors='ignore')
print("\nWith error='ignore' (characters removed):", df_ignore, sep="\n")

## Question 10
Compare file sizes and read speeds between different formats (CSV, Parquet, Excel) for the same data.

In [None]:
import time

# Create test data
test_df = pd.DataFrame({
    'ID': range(5000),
    'Value1': np.random.randn(5000),
    'Value2': np.random.randn(5000),
    'Category': np.random.choice(['A', 'B', 'C', 'D'], 5000),
    # Lowercase 'h' is the hourly alias; the uppercase 'H' spelling is
    # deprecated in recent pandas releases
    'Date': pd.date_range('2023-01-01', periods=5000, freq='h')
})

print(f"Test DataFrame shape: {test_df.shape}")

# Single source of truth for each format's output file; used by the
# save/read callables, the size measurement, and the final cleanup
file_paths = {
    'CSV': 'speed_test.csv',
    'Parquet': 'speed_test.parquet',
    'Excel': 'speed_test.xlsx',
}

# Each format maps to a (save, read) pair of zero-argument callables
formats = {
    'CSV': (lambda: test_df.to_csv(file_paths['CSV'], index=False),
            lambda: pd.read_csv(file_paths['CSV'])),
    'Parquet': (lambda: test_df.to_parquet(file_paths['Parquet'], index=False),
                lambda: pd.read_parquet(file_paths['Parquet'])),
    'Excel': (lambda: test_df.to_excel(file_paths['Excel'], index=False),
              lambda: pd.read_excel(file_paths['Excel'])),
}

results = {}

for format_name, (save_func, read_func) in formats.items():
    # Measure save time (perf_counter is monotonic and high-resolution,
    # unlike the wall clock returned by time.time)
    start_time = time.perf_counter()
    save_func()
    save_time = time.perf_counter() - start_time

    # Measure file size
    file_size = os.path.getsize(file_paths[format_name])

    # Measure read time
    start_time = time.perf_counter()
    df_read = read_func()
    read_time = time.perf_counter() - start_time

    results[format_name] = {
        'save_time': save_time,
        'read_time': read_time,
        'file_size': file_size,
        'rows_read': len(df_read)
    }

print("\nPerformance Comparison:")
print(f"{'Format':<10} {'Save(s)':<10} {'Read(s)':<10} {'Size(KB)':<10} {'Rows':<10}")
print("-" * 50)
for format_name, stats in results.items():
    print(f"{format_name:<10} {stats['save_time']:<10.3f} {stats['read_time']:<10.3f} {stats['file_size']/1024:<10.1f} {stats['rows_read']:<10}")

# Clean up files
for filename in file_paths.values():
    if os.path.exists(filename):
        os.remove(filename)