# Data Analysis of MongoDB Collections

Exploratory analysis of three datasets, each loaded from a CSV file:
1. Organizations (100 records)
2. People (100 records)
3. Customers (100 records)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Render every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

# Read the three CSV files, in order: organizations, people, customers.
organizations_df, people_df, customers_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'organizations-100.csv',
        'people-100.csv',
        'customers-100.csv',
    )
)

OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

## 1. Organizations Dataset Analysis

In [None]:
# Overview printed by DataFrame.info().
print("\n=== Organizations Dataset Information ===")
organizations_df.info()

# Summary tables, each rendered under its own printed heading.
for heading, table in (
    ("\n=== First 5 Records ===", organizations_df.head()),
    ("\n=== Numerical Columns Statistics ===", organizations_df.describe()),
    ("\n=== Null Values Count ===", organizations_df.isnull().sum()),
):
    print(heading)
    display(table)

# One 30-bin histogram per column: (column name, figure title, x-axis label).
for column, title, x_label in (
    ('Number of employees', 'Distribution of Number of Employees', 'Number of Employees'),
    ('Founded', 'Distribution of Founded Year', 'Year'),
):
    fig, ax = plt.subplots(figsize=(15, 5))
    organizations_df[column].hist(bins=30, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    plt.show()

## 2. People Dataset Analysis

In [None]:
# Overview printed by DataFrame.info().
print("\n=== People Dataset Information ===")
people_df.info()

# Summary tables, each rendered under its own printed heading.
# These are computed before the date conversion below, as in the original order.
for heading, table in (
    ("\n=== First 5 Records ===", people_df.head()),
    ("\n=== Numerical Columns Statistics ===", people_df.describe()),
    ("\n=== Null Values Count ===", people_df.isnull().sum()),
):
    print(heading)
    display(table)

# Convert 'Date of birth' to datetime so the .dt accessor can be used on it.
people_df['Date of birth'] = pd.to_datetime(people_df['Date of birth'])

# Histogram of birth years (the year component of 'Date of birth').
birth_years = people_df['Date of birth'].dt.year
fig, ax = plt.subplots(figsize=(15, 5))
birth_years.hist(bins=30, ax=ax)
ax.set_title('Distribution of Birth Years')
ax.set_xlabel('Year')
ax.set_ylabel('Frequency')
plt.show()

# Bar chart of how often each value of the 'Sex' column occurs.
sex_counts = people_df['Sex'].value_counts()
fig, ax = plt.subplots(figsize=(8, 5))
sex_counts.plot(kind='bar', ax=ax)
ax.set_title('Gender Distribution')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
plt.show()

## 3. Customers Dataset Analysis

In [None]:
# Overview printed by DataFrame.info().
print("\n=== Customers Dataset Information ===")
customers_df.info()

# Summary tables, each rendered under its own printed heading.
# These are computed before the date conversion below, as in the original order.
for heading, table in (
    ("\n=== First 5 Records ===", customers_df.head()),
    ("\n=== Numerical Columns Statistics ===", customers_df.describe()),
    ("\n=== Null Values Count ===", customers_df.isnull().sum()),
):
    print(heading)
    display(table)

# Convert 'Subscription Date' to datetime so the .dt accessor can be used on it.
customers_df['Subscription Date'] = pd.to_datetime(customers_df['Subscription Date'])

# Histogram (12 bins) of the year component of 'Subscription Date'.
subscription_years = customers_df['Subscription Date'].dt.year
fig, ax = plt.subplots(figsize=(15, 5))
subscription_years.hist(bins=12, ax=ax)
ax.set_title('Distribution of Subscription Years')
ax.set_xlabel('Year')
ax.set_ylabel('Frequency')
plt.show()

# Bar chart of the ten most frequent values in 'Country'.
top_countries = customers_df['Country'].value_counts().head(10)
fig, ax = plt.subplots(figsize=(12, 5))
top_countries.plot(kind='bar', ax=ax)
ax.set_title('Top 10 Countries')
ax.set_xlabel('Country')
ax.set_ylabel('Count')
# Tilt the country labels on the x-axis by 45 degrees.
plt.xticks(rotation=45)
plt.show()

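## Summary of Findings

**Note:** The bracketed values below (`[min]`, `[max]`, `[top 3]`, `[majority/minority]`) are placeholders that have not been filled in yet. After running the notebook, replace them with the values shown in the cell outputs above, and confirm each "No missing values" statement against the corresponding null-count table.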

1. Organizations Dataset:
   - No missing values
   - Employee count ranges from [min] to [max]
   - Founded years span from [min] to [max]

2. People Dataset:
   - No missing values
   - Birth dates from [min] to [max]
   - Gender distribution shows [majority/minority]

3. Customers Dataset:
   - No missing values
   - Subscription dates from [min] to [max]
   - Most common countries: [top 3]