In [None]:
# Install pandas and visualization libraries
!pip install pandas matplotlib seaborn

# Pandas Tutorial - 5 Minute Guide

**Pandas** is the most popular Python library for data manipulation and analysis. It provides:
- Fast, flexible data structures (DataFrame, Series)
- Data cleaning and transformation tools
- Built-in visualization capabilities
- Excellent integration with the Python ecosystem

This makes pandas a good fit for day-to-day data analysis, reporting, and exploratory data analysis (EDA).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the customer dataset from a CSV file in the working directory
df = pd.read_csv('synthetic_customers_120k.csv')

# Other readers follow the same pattern (left commented out as a reference):
#   df = pd.read_json('data.json')        -> JSON files
#   df = pd.read_excel('data.xlsx')       -> Excel files
#   df = pd.read_parquet('data.parquet')  -> Parquet files
#
# A DataFrame can also be built directly from a dictionary of columns:
#   data = {'name': ['Alice', 'Bob'], 'age': [25, 30]}
#   df = pd.DataFrame(data)

# First look: dimensions as (rows, columns), then the column names
print("Dataset shape:", df.shape)
print("\nColumn names:", df.columns.tolist(), sep="\n")

# A bare expression on the last line of a cell is rendered as a formatted
# table, so the first three rows are shown without calling print()
print("\nFirst 3 rows:")
df.head(3)

In [None]:
# Overview: the dtype of every column, followed by summary statistics
print("Data types:", df.dtypes, sep="\n")

# describe() is the cell's last expression so it renders as a table
print("\nSummary statistics:")
df.describe()

## 📊 Data Visualization

In [None]:
# Four quick-look charts arranged on a single 2x2 figure
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
(ax_hist, ax_states), (ax_scatter, ax_pie) = axes

# Top-left: histogram of the average purchase value
df['avg_purchase_value'].hist(bins=50, ax=ax_hist)
ax_hist.set_title('Purchase Value Distribution')

# Top-right: the ten states with the most customer rows
top_states = df['state'].value_counts().head(10)
top_states.plot(kind='bar', ax=ax_states)
ax_states.set_title('Top 10 States by Customers')

# Bottom-left: number of purchases against total spend, one dot per customer
ax_scatter.scatter(df['num_purchases'], df['total_spent'], alpha=0.5)
ax_scatter.set(
    xlabel='Number of Purchases',
    ylabel='Total Spent',
    title='Purchases vs Spending',
)

# Bottom-right: share of customers per preferred category
category_counts = df['preferred_category'].value_counts()
category_counts.plot(kind='pie', ax=ax_pie)
ax_pie.set_title('Category Preferences')

# Tighten spacing so titles and tick labels do not overlap, then render
plt.tight_layout()
plt.show()

## 🔍 Data Filtering & Selection

In [None]:
# A comparison on a column yields a boolean mask; indexing with it keeps matching rows
in_california = df['state'] == 'CA'
california_customers = df[in_california]
print(f"California customers: {len(california_customers)}")

# Masks combine element-wise with & (AND) and | (OR). When conditions are
# written inline, each comparison must be wrapped in parentheses.
spent_over_1000 = df['total_spent'] > 1000
is_active = df['account_status'] == 'Active'
high_value = df[spent_over_1000 & is_active]
print(f"High-value active customers: {len(high_value)}")

# Indexing with a list of names selects several columns at once
# (this is why the inline form has double brackets: df[[...]])
summary_columns = ['first_name', 'last_name', 'state', 'total_spent']
customer_summary = df[summary_columns].head()
print("\nCustomer summary:")
customer_summary

## 📈 GroupBy Analysis

In [None]:
# Per-state metrics. agg() is short for "aggregate": the dict maps each column
# to the function(s) applied to it within every group. Giving a list of
# functions for a column produces two-level column labels such as
# ('total_spent', 'sum').
state_metrics = {
    'customer_id': 'count',
    'total_spent': ['mean', 'sum'],
    'avg_purchase_value': 'mean',
    'loyalty_points': 'mean',
}
state_stats = df.groupby('state').agg(state_metrics).round(2)

# Rank states by summed spend; the sort key is the two-level column label
revenue_column = ('total_spent', 'sum')
print("Top 10 states by total revenue:")
state_stats.sort_values(revenue_column, ascending=False).head(10)

In [None]:
# Per-category metrics using named aggregation: each keyword is the output
# column name, each value is a (source column, function) pair, so no separate
# column-renaming step is needed.
category_analysis = (
    df.groupby('preferred_category')
    .agg(
        Customers=('customer_id', 'count'),
        Avg_Spent=('total_spent', 'mean'),
        Avg_Purchases=('num_purchases', 'mean'),
        Avg_Loyalty=('loyalty_points', 'mean'),
    )
    .round(2)
)

print("Category performance:")
category_analysis

## 💾 Data Export

In [None]:
# Write the filtered high-value customers to CSV; index=False leaves the
# row index out of the file
high_value_path = 'high_value_customers.csv'
high_value.to_csv(high_value_path, index=False)
print(f"Exported {len(high_value)} high-value customers to CSV")

# Write the per-state summary to an Excel workbook.
# Writing .xlsx requires an Excel engine such as openpyxl to be installed.
state_stats_path = 'state_analysis.xlsx'
state_stats.to_excel(state_stats_path)
print("Exported state analysis to Excel")

# Further writers, left commented out as a reference:
#   df.to_json('data.json')        -> JSON
#   df.to_parquet('data.parquet')  -> Parquet (efficient)
#   df.to_html('report.html')      -> HTML table

## ⚡ Essential Operations

In [None]:
# Data quality check: count the missing values in every column
# (this only reports them; nothing is dropped or filled here)
print("Missing values per column:")
print(df.isnull().sum())

# Create new calculated columns.
# Zero purchase counts are masked to NaN before dividing, so a customer with
# no purchases gets NaN instead of inf; an inf would distort any mean or
# correlation computed on this column later.
purchase_counts = df['num_purchases'].where(df['num_purchases'] != 0)
df['purchase_efficiency'] = df['total_spent'] / purchase_counts

# Flag customers whose total spend is strictly above the 75th percentile
df['is_premium'] = df['total_spent'] > df['total_spent'].quantile(0.75)

# Quick value counts: frequency of each distinct value, most common first
print("\nAccount status distribution:")
print(df['account_status'].value_counts())

# head() with no argument keeps the first five entries
print("\nTop 5 cities:")
print(df['city'].value_counts().head())

In [None]:
# Pairwise correlations between the main numeric customer metrics
numeric_cols = ['num_purchases', 'avg_purchase_value', 'total_spent', 'loyalty_points']
correlation_matrix = df[numeric_cols].corr()

# Annotated heatmap on an explicit figure/axes pair; center=0 puts the
# midpoint of the diverging colormap at zero correlation
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    ax=corr_ax,
)
corr_ax.set_title('Feature Correlations')
plt.show()

print("Dataset ready for analysis! 🚀")