<a href="https://colab.research.google.com/github/bhuguvi26/guviproject/blob/main/Copy_of_Server_Log_Data_Extraction_and_User_History_Database_Update.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Source code

In [1]:
# ============================================================
#   Google Colab – Automated Email ETL (Downloads mbox.txt)
# ============================================================

!pip install -q pymongo dnspython

import os, re, sqlite3, sys
from datetime import datetime
from pymongo import MongoClient
from google.colab import files

# ---------------- CONFIG ----------------
LOG_FILE = "/content/mbox.txt"
MONGO_URI = "mongodb+srv://testuser:testuser@cluster0.obh30fm.mongodb.net/?appName=Cluster0"
MONGO_DB = "server_logs"
MONGO_COLLECTION = "user_history"
SQLITE_DB = "/content/user_history.db"

# ---------------- STEP 1: DOWNLOAD FILE ----------------
print("\n‚¨áÔ∏è Downloading mbox.txt from GitHub...")
!wget -q -O /content/mbox.txt "https://raw.githubusercontent.com/bhuguvi26/guviproject/main/mbox.txt"

if not os.path.exists(LOG_FILE):
    sys.exit("‚ùå Failed to download mbox.txt")

print("‚úÖ Download complete.\n")

# ---------------- STEP 2: Extract ----------------
def extract_email_date(filepath):
    """Extract unique (email, timestamp) pairs from mbox-style ``From `` lines.

    A line is considered only if it starts with ``From`` followed by
    whitespace, an e-mail address, whitespace and the rest of the line
    (so header lines such as ``From: ...`` are ignored). The remainder of the
    line is parsed as a date; lines whose date matches none of the supported
    formats are skipped.

    Args:
        filepath: Path of the text file to scan. It is read as UTF-8 and
            undecodable bytes are ignored.

    Returns:
        A list of ``{"email": ..., "date": "YYYY-MM-DD HH:MM:SS"}`` dicts in
        first-seen order, with duplicate (email, date) pairs removed.
    """
    print("üîç Extracting email addresses and dates...")
    from_pattern = re.compile(r'^From\s+([\w\.-]+@[\w\.-]+)\s+(.*)')
    # Supported layouts: without and with a numeric UTC offset before the year.
    date_formats = ("%a %b %d %H:%M:%S %Y", "%a %b %d %H:%M:%S %z %Y")

    data, seen = [], set()

    with open(filepath, encoding="utf-8", errors="ignore") as f:
        for line in f:
            m = from_pattern.match(line.strip())
            if m is None:
                continue
            email, datestr = m.group(1), m.group(2)

            dt = None
            for fmt in date_formats:
                try:
                    dt = datetime.strptime(datestr, fmt)
                except ValueError:
                    # Only a format mismatch is expected here; anything else
                    # (e.g. KeyboardInterrupt) must propagate.
                    continue
                break

            if dt is None:
                continue

            # Any UTC offset is dropped by this formatting; the wall-clock
            # time from the log line is what gets stored.
            key = (email, dt.strftime("%Y-%m-%d %H:%M:%S"))
            if key not in seen:
                seen.add(key)
                data.append({"email": key[0], "date": key[1]})

    print(f"‚úÖ Extracted {len(data)} email-date pairs.\n")
    return data

# Run the extraction step on the downloaded log and stop the notebook with an
# error message if it produced no records at all.
extracted = extract_email_date(LOG_FILE)
if not extracted:
    sys.exit("‚ùå No email-date pairs found ‚Äî check your file content!")

# ---------------- STEP 3: MongoDB ----------------
def upload_to_mongo(data):
    """Replace the contents of the MongoDB collection with ``data``.

    Connects using ``MONGO_URI``, verifies the connection with a ping, removes
    every existing document from ``MONGO_DB.MONGO_COLLECTION`` and bulk-inserts
    the given records. Failures are reported on stdout rather than raised so
    the pipeline can continue with local data.

    Args:
        data: List of dicts to store (one document per dict). The dicts are
            not modified.

    Returns:
        True if the ping, delete and insert all succeeded, False otherwise.
    """
    print("üåê Connecting to MongoDB Atlas...")
    client = None
    try:
        client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        client.admin.command("ping")
        print("‚úÖ MongoDB connection successful.")

        db = client[MONGO_DB]
        col = db[MONGO_COLLECTION]

        col.delete_many({})
        # insert_many adds an "_id" key to every dict it is handed; insert
        # shallow copies so the caller's records stay free of ObjectIds.
        col.insert_many([dict(doc) for doc in data])

        print(f"‚úÖ Inserted {len(data)} documents into MongoDB.\n")
        return True
    except Exception as e:
        # Deliberately broad: any MongoDB problem downgrades to local-only mode.
        print(f"‚ö†Ô∏è MongoDB upload failed: {e}\nContinuing locally...")
        return False
    finally:
        # Release the connection pool whether or not the upload succeeded.
        if client is not None:
            client.close()

# Remember whether the MongoDB upload worked; the result decides below whether
# records are read back from MongoDB or taken from the local extraction.
mongo_ok = upload_to_mongo(extracted)

# ---------------- STEP 4: Fetch from MongoDB ----------------
def fetch_from_mongo():
    """Read every document back from the MongoDB collection.

    Connects using ``MONGO_URI`` and returns all documents of
    ``MONGO_DB.MONGO_COLLECTION`` with the ``_id`` field projected away.

    Returns:
        A list of dicts, or an empty list if the fetch failed (the error is
        printed, not raised).
    """
    print("üì• Fetching data back from MongoDB...")
    client = None
    try:
        client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        col = client[MONGO_DB][MONGO_COLLECTION]
        # Materialise the cursor before the client is closed below.
        recs = list(col.find({}, {"_id": 0}))
        print(f"‚úÖ Fetched {len(recs)} records from MongoDB.\n")
        return recs
    except Exception as e:
        # Deliberately broad: a failed fetch is reported and yields no records.
        print(f"‚ö†Ô∏è Fetch failed: {e}")
        return []
    finally:
        # Release the connection pool whether or not the fetch succeeded.
        if client is not None:
            client.close()

# Prefer the copy read back from MongoDB, but fall back to the locally
# extracted records when MongoDB was unavailable or the fetch returned nothing
# (fetch_from_mongo reports failure as an empty list), so SQLite is never left
# empty while extracted data exists.
records = (fetch_from_mongo() if mongo_ok else []) or extracted

# ---------------- STEP 5: SQLite ----------------
def save_to_sqlite(records, db_path=None):
    """Append records to the ``user_history`` table of a SQLite database.

    The table is created if it does not exist yet. Rows are appended with a
    plain INSERT, so calling this twice with the same records stores them
    twice. All rows are inserted in one transaction: either every row is
    committed or, on error, none is.

    Args:
        records: Iterable of mappings with ``'email'`` and ``'date'`` keys.
        db_path: SQLite file to write to. Defaults to the module-level
            ``SQLITE_DB`` path.

    Raises:
        KeyError: If a record lacks ``'email'`` or ``'date'``.
        sqlite3.Error: If the database cannot be opened or written.
    """
    if db_path is None:
        db_path = SQLITE_DB

    print("üíæ Saving records to SQLite database...")

    create_sql = """
        CREATE TABLE IF NOT EXISTS user_history(
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            email TEXT NOT NULL,
            date TEXT NOT NULL
        )
    """
    # Build the parameter rows first so a malformed record fails before the
    # database is touched.
    rows = [(r['email'], r['date']) for r in records]

    conn = sqlite3.connect(db_path)
    try:
        # The connection context manager commits on success and rolls back
        # if an exception escapes; it does not close the connection.
        with conn:
            conn.execute(create_sql)
            conn.executemany(
                "INSERT INTO user_history(email, date) VALUES (?, ?)",
                rows,
            )
    finally:
        conn.close()

    print(f"‚úÖ Inserted {len(records)} rows into SQLite.\n")

# Write the selected records into the SQLite database file.
save_to_sqlite(records)

# ---------------- STEP 6: SQL Queries ----------------
def run_sql_queries(db_path=None):
    """Run the fixed set of analysis queries and print their result rows.

    For each query the label is printed, followed by every result row as a
    tuple and a 50-character dashed separator. Nothing is returned.

    Args:
        db_path: SQLite file to query. Defaults to the module-level
            ``SQLITE_DB`` path.

    Raises:
        sqlite3.Error: If the database cannot be opened or a query fails
            (for example when the ``user_history`` table does not exist).
    """
    if db_path is None:
        db_path = SQLITE_DB

    print("üìä Running SQL analysis...\n")

    queries = {
        "1Ô∏è‚É£ Unique Emails": "SELECT COUNT(DISTINCT email) FROM user_history;",
        "2Ô∏è‚É£ Emails per Day": "SELECT DATE(date), COUNT(*) FROM user_history GROUP BY DATE(date);",
        "3Ô∏è‚É£ First Email per Address": "SELECT email, MIN(date) FROM user_history GROUP BY email LIMIT 5;",
        "4Ô∏è‚É£ Top Domains": """
            SELECT SUBSTR(email, INSTR(email,'@')+1) AS domain, COUNT(*) AS cnt
            FROM user_history GROUP BY domain ORDER BY cnt DESC LIMIT 5;
        """
    }

    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        for name, q in queries.items():
            print(f"‚û°Ô∏è {name}")
            cur.execute(q)
            for row in cur.fetchall():
                print(row)
            print("-" * 50)
    finally:
        # Close the connection even if one of the queries raises.
        conn.close()

    print("\n‚úÖ SQL analysis complete.\n")

# Print the analysis results for the freshly written database.
run_sql_queries()

# ---------------- STEP 7: Download SQLite ----------------
# Hand the SQLite file to the `files.download` helper imported from
# google.colab (presumably a browser download — only available inside Colab).
print("üì¶ Downloading SQLite database...")
files.download(SQLITE_DB)
print("‚úÖ Done! Database saved locally.")


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.7/1.7 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m331.1/331.1 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25h
‚¨áÔ∏è Downloading mbox.txt from GitHub...
‚úÖ Download complete.

üîç Extracting email addresses and dates...
‚úÖ Extracted 1795 email-date pairs.

üåê Connecting to MongoDB Atlas...
‚úÖ MongoDB connection successful.
‚úÖ Inserted 1795 documents into MongoDB.

üì• Fetching data back from MongoDB...
‚úÖ Fetched 1795 records from MongoDB.

üíæ Saving records to SQLite database...
‚úÖ Inserted 1795 rows into SQLite.

üìä Running SQL analysis...

‚û°Ô∏è 1Ô∏è‚É£ Unique Emails
(46,)
--------------------------------------------------
‚û°Ô∏è 2Ô∏è‚É£ Emails per Day
('2007-10-18', 13)
('2007-

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

‚úÖ Done! Database saved locally.


# New Section

# ReadMe
# User History Extraction & Analysis

## Overview
This project is designed to make sense of email activity recorded in server log files. Logs are usually unstructured and hard to analyze, so the goal here is to extract email addresses and timestamps, store them in databases, and run queries to get actionable insights. In short, it's a small but complete data pipeline for tracking historical email activity.

---

## Problem
Server logs contain valuable information about user activity, but the raw data is messy. This project solves that by:
1. Extracting every email address along with its associated date and time.
2. Cleaning and standardizing the data.
3. Storing it in MongoDB and SQLite.
4. Running SQL queries to answer practical questions like “Who sent the most emails?” or “Which email domains are most active?”

---

## Project Workflow
1. **Extract Emails and Dates** – Reads the log file and captures all email addresses with their timestamps.  
2. **Transform Data** – Formats dates in a standard `YYYY-MM-DD HH:MM:SS` format and structures the data for database insertion.  
3. **Save to MongoDB** – Inserts cleaned data into a MongoDB collection (`user_history`).  
4. **Move to SQLite** – Fetches data from MongoDB (or directly uses local data if MongoDB isn't available) and inserts it into a relational table.  
5. **Analyze** – Executes SQL queries to generate insights like unique users, email counts per day, first/last email dates, and top domains.

---

## Tools & Technologies
- **Python 3** – Main scripting and data processing.  
- **MongoDB Atlas** – NoSQL storage for document-based data.  
- **SQLite** – Relational database for SQL analysis.  
- **Regular Expressions (Regex)** – For extracting emails and dates.  
- **Google Colab** – Supports file upload, processing, and SQLite download.  

---

## Project Structure
```
project/
│
├── mbox.txt          # Input server log file
├── user_history.db   # Generated SQLite database
├── pipeline.py       # Main Python script
└── README.md         # This document
```
2. Upload Your Log File

The script will prompt you to upload mbox.txt.

Ensure log entries look like:

From user@example.com Sat Jan  5 09:14:16 2025

3. Run the Pipeline
python pipeline.py


The script will:

Extract email-date pairs.

Upload to MongoDB (if available).

Save the data into SQLite.

Run SQL queries to generate insights.


Example SQL Queries

Here are some sample queries to analyze the data:

Count unique email addresses

SELECT COUNT(DISTINCT email) FROM user_history;


Emails per day

SELECT DATE(date), COUNT(*) FROM user_history GROUP BY DATE(date);


First and last email per address

SELECT email, MIN(date) AS first_email, MAX(date) AS last_email FROM user_history GROUP BY email;


Top email domains

SELECT SUBSTR(email, INSTR(email,'@')+1) AS domain, COUNT(*) AS count
FROM user_history GROUP BY domain ORDER BY count DESC;


Total emails

SELECT COUNT(*) FROM user_history;


Top 5 users by email count

SELECT email, COUNT(*) AS count FROM user_history GROUP BY email ORDER BY count DESC LIMIT 5;


Emails from Gmail

SELECT COUNT(*) FROM user_history WHERE email LIKE '%@gmail.com';


Emails after a specific date

SELECT * FROM user_history WHERE date > '2025-01-01 00:00:00';


Emails per month

SELECT STRFTIME('%Y-%m', date) AS month, COUNT(*) FROM user_history GROUP BY month;


Custom queries

Any other analysis you need can be executed using standard SQL.