In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
import json

In [None]:
# Connect to MongoDB and fetch data
# The client is opened as a context manager so that its connections are
# released as soon as the documents have been read, instead of staying open
# for the lifetime of the kernel.
# NOTE(review): `client`, `db` and `users_collection` remain defined after
# this block, but the client is closed -- do not issue further queries
# through them; re-open a client if more data is needed.
with MongoClient('mongodb://localhost:27017/') as client:
    db = client['survey_db']
    users_collection = db['users']

    # find() hands back a cursor; list() drains it while the client is still
    # open, so every document is in memory before the connection is closed.
    data = list(users_collection.find())

# Convert MongoDB data to DataFrame (one row per fetched document)
df = pd.DataFrame(data)

In [None]:
# Flatten the expenses dictionary into separate columns.
# The guard makes this cell safe to re-run: after the first run 'expenses'
# has already been replaced by its flattened columns, and normalising /
# dropping it a second time would raise a KeyError.
if 'expenses' in df.columns:
    # Any row whose 'expenses' value is not a dict (for example when the
    # field is missing from the source document) is flattened as an empty
    # record, so it ends up with missing values in the expense columns
    # rather than breaking the normalisation.
    expense_records = [
        entry if isinstance(entry, dict) else {}
        for entry in df['expenses']
    ]
    expense_df = pd.json_normalize(expense_records)

    # Give the flattened frame df's own index so the column-wise concat
    # pairs rows one-to-one even if df does not carry a default 0..n-1 index.
    expense_df.index = df.index
    df = pd.concat([df.drop('expenses', axis=1), expense_df], axis=1)

In [None]:
# Write the current DataFrame to a CSV file in the working directory;
# index=False keeps the row index out of the file.
df.to_csv(path_or_buf='survey_data.csv', index=False)

In [None]:
# Analysis 1: Ages with Highest Income
# Scatter plot of the 'income' column against the 'age' column, saved as a PNG.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=df, x='age', y='income', ax=ax)
ax.set_title('Income Distribution by Age')
ax.set_xlabel('Age')
ax.set_ylabel('Income ($)')
fig.savefig('income_by_age.png')
# Close the figure once it is on disk so it is not kept open in memory.
plt.close(fig)

In [None]:
# Analysis 2: Average Spending by Gender for Each Category
# Columns averaged per gender; this list is reused by later cells.
expense_categories = ['utilities', 'entertainment', 'school_fees', 'shopping', 'healthcare']
gender_spending = df.groupby('gender')[expense_categories].mean()

# Create the figure and axes explicitly and hand the axes to pandas.
# DataFrame.plot opens a new figure of its own when no `ax` is passed, so a
# preceding plt.figure(figsize=...) would be left open and empty while the
# saved chart silently ignored the requested size.
fig, ax = plt.subplots(figsize=(12, 6))
gender_spending.plot(kind='bar', ax=ax)
ax.set_title('Average Spending by Gender Across Categories')
ax.set_xlabel('Gender')
ax.set_ylabel('Amount ($)')
# Anchor the legend just outside the right edge of the axes.
ax.legend(title='Expense Category', bbox_to_anchor=(1.05, 1))
fig.tight_layout()
fig.savefig('spending_by_gender.png')
# Close the one figure this cell created so nothing is left open.
plt.close(fig)

In [None]:
# Additional Analysis: Total Expenses Distribution
# Row-wise sum over the expense category columns, stored as a new column.
df['total_expenses'] = df[expense_categories].sum(axis=1)

# One box per gender value showing the spread of the row totals.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='gender', y='total_expenses', ax=ax)
ax.set(
    title='Total Expenses Distribution by Gender',
    xlabel='Gender',
    ylabel='Total Expenses ($)',
)
fig.savefig('total_expenses_distribution.png')
plt.close(fig)


In [None]:
# Save summary statistics
# Build the summary one entry at a time, then write it out as indented JSON.
summary_stats = {}

# Number of rows in the DataFrame
summary_stats['total_respondents'] = len(df)

# Mean of the 'income' column
summary_stats['average_income'] = df['income'].mean()

# Smallest and largest 'age' value joined into a single "min - max" string
summary_stats['age_range'] = f"{df['age'].min()} - {df['age'].max()}"

# Row count for each distinct 'gender' value
summary_stats['gender_distribution'] = df['gender'].value_counts().to_dict()

# Mean of every expense category column, keyed by column name
summary_stats['average_expenses_by_category'] = df[expense_categories].mean().to_dict()

with open('summary_statistics.json', 'w') as f:
    json.dump(summary_stats, f, indent=4)