In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import json
import os
from datetime import datetime

# Echo the run parameters. `input_file` and `job_id` are not defined in this
# cell; presumably they are injected by a parameterised runner -- TODO confirm.
print("Processing file: {}".format(input_file))
print("Job ID: {}".format(job_id))


In [None]:
# Read the input file
# The pandas reader is chosen from the (case-insensitive) file extension.
try:
    lowered_name = input_file.lower()
    if lowered_name.endswith(('.xlsx', '.xls')):
        reader = pd.read_excel
    elif lowered_name.endswith('.json'):
        reader = pd.read_json
    else:
        # '.csv' and every unrecognised extension are parsed as CSV
        reader = pd.read_csv
    df = reader(input_file)

    print(f"Successfully loaded file with shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

except Exception as load_error:
    # Report the failure in the cell output, then re-raise the same exception
    print(f"Error reading file: {str(load_error)}")
    raise

In [None]:
# Basic data cleaning
# Remember the shape of the frame as loaded so it can be compared afterwards.
original_shape = df.shape
print(f"Original shape: {original_shape}")

# Keep only the rows that hold at least one non-null value
df_cleaned = df.dropna(how='all')
print(f"After removing empty rows: {df_cleaned.shape}")

# Report the size of the cleaned frame and its per-column null counts
row_count, column_count = df_cleaned.shape
print("\nDataset Info:")
print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")
print("Missing values per column:")
print(df_cleaned.isna().sum())

In [None]:
# Generate summary statistics
# Collects run metadata and per-column facts about `df_cleaned` into a plain
# dict (`summary_data`); a later cell serialises this dict with json.dump.

# Select the numeric columns once; reused for the summary and the statistics.
numeric_cols = df_cleaned.select_dtypes(include=[np.number]).columns

summary_data = {
    "job_id": job_id,
    "filename": os.path.basename(input_file),
    "processed_at": datetime.now().isoformat(),
    "original_shape": original_shape,
    "cleaned_shape": df_cleaned.shape,
    "columns": list(df_cleaned.columns),
    "dtypes": df_cleaned.dtypes.astype(str).to_dict(),
    "missing_values": df_cleaned.isnull().sum().to_dict(),
    "numeric_columns": list(numeric_cols),
    "categorical_columns": list(df_cleaned.select_dtypes(include=['object']).columns)
}

# Add descriptive statistics for numeric columns
if len(numeric_cols) > 0:
    desc_stats = df_cleaned[numeric_cols].describe()
    # describe() yields NaN for statistics that are undefined (e.g. the std of
    # a single row, or any statistic of an all-null column). json.dump would
    # write those as the bare token NaN, which strict JSON parsers reject, so
    # they are stored as None (serialised as null) instead.
    summary_data["descriptive_statistics"] = {
        column: {
            stat: (None if pd.isna(value) else value)
            for stat, value in stats.items()
        }
        for column, stats in desc_stats.to_dict().items()
    }
    print("\nDescriptive Statistics:")
    print(desc_stats)
else:
    summary_data["descriptive_statistics"] = {}
    print("No numeric columns found for descriptive statistics")

print(f"\nSummary data keys: {list(summary_data.keys())}")

In [None]:
# Create visualization
# Builds one 2x2 report figure: dataset overview, missing values, histogram of
# the first numeric column, and a correlation heatmap. The figure is saved to
# `output_image` and then shown.
plt.style.use('default')
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle(f'Data Analysis Report - {os.path.basename(input_file)}', fontsize=16, fontweight='bold')

# Plot 1: Data overview
axes[0, 0].bar(
    ['Total Rows', 'Total Columns', 'Numeric Cols', 'Text Cols'],
    [
        len(df_cleaned),
        len(df_cleaned.columns),
        len(summary_data['numeric_columns']),
        len(summary_data['categorical_columns']),
    ],
    color=['#3498db', '#2ecc71', '#f39c12', '#e74c3c'],
)
axes[0, 0].set_title('Dataset Overview')
axes[0, 0].set_ylabel('Count')

# Plot 2: Missing values
missing_data = df_cleaned.isnull().sum()
if missing_data.sum() > 0:
    # Top 10 columns by missing count: sort descending before truncating,
    # otherwise head(10) would just take the first 10 in column order.
    missing_data = missing_data[missing_data > 0].sort_values(ascending=False).head(10)
    axes[0, 1].bar(range(len(missing_data)), missing_data.values, color='#e74c3c')
    axes[0, 1].set_title('Missing Values by Column')
    axes[0, 1].set_ylabel('Missing Count')
    axes[0, 1].set_xticks(range(len(missing_data)))
    axes[0, 1].set_xticklabels(missing_data.index, rotation=45, ha='right')
else:
    # Nothing to plot: show a centred message in the otherwise empty panel
    axes[0, 1].text(0.5, 0.5, 'No Missing Values!', ha='center', va='center', transform=axes[0, 1].transAxes, fontsize=14)
    axes[0, 1].set_title('Missing Values')

# Plot 3: Histogram of first numeric column
if len(numeric_cols) > 0:
    first_numeric = numeric_cols[0]
    df_cleaned[first_numeric].hist(bins=30, ax=axes[1, 0], color='#3498db', alpha=0.7)
    axes[1, 0].set_title(f'Distribution of {first_numeric}')
    axes[1, 0].set_xlabel(first_numeric)
    axes[1, 0].set_ylabel('Frequency')
else:
    axes[1, 0].text(0.5, 0.5, 'No Numeric Columns\nfor Histogram', ha='center', va='center', transform=axes[1, 0].transAxes, fontsize=12)
    axes[1, 0].set_title('Distribution Plot')

# Plot 4: Correlation heatmap (if multiple numeric columns)
if len(numeric_cols) > 1:
    correlation_matrix = df_cleaned[numeric_cols].corr()
    # Fix the colour scale to [-1, 1] so colours mean the same across reports
    im = axes[1, 1].imshow(correlation_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
    axes[1, 1].set_title('Correlation Matrix')
    axes[1, 1].set_xticks(range(len(numeric_cols)))
    axes[1, 1].set_yticks(range(len(numeric_cols)))
    axes[1, 1].set_xticklabels(numeric_cols, rotation=45, ha='right')
    axes[1, 1].set_yticklabels(numeric_cols)
    fig.colorbar(im, ax=axes[1, 1])
else:
    axes[1, 1].text(0.5, 0.5, 'Need 2+ Numeric Columns\nfor Correlation', ha='center', va='center', transform=axes[1, 1].transAxes, fontsize=12)
    axes[1, 1].set_title('Correlation Matrix')

fig.tight_layout()

# Save the plot
# Best effort: a failed save is recorded in the summary rather than raised.
try:
    fig.savefig(output_image, dpi=300, bbox_inches='tight')
    print(f"Plot saved to: {output_image}")
    summary_data["visualization_created"] = True
except Exception as e:
    print(f"Error saving plot: {str(e)}")
    summary_data["visualization_created"] = False

plt.show()

In [None]:
# Save summary to JSON file
try:
    with open(summary_file, mode='w') as summary_handle:
        # default=str: values json cannot encode natively are written via str()
        json.dump(summary_data, summary_handle, indent=2, default=str)
    print(f"Summary saved to: {summary_file}")
    print("Analysis completed successfully!")
except Exception as save_error:
    # Report the failure in the cell output, then re-raise the same exception
    print(f"Error saving summary: {str(save_error)}")
    raise