# Setting up environment
Load modules and connect to databases.

---

In [13]:
# Import libraries
from dotenv import load_dotenv
import os
from os import environ as env
import psycopg2
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display

In [14]:
# Load environment variables
load_dotenv()
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_host = os.getenv('DB_HOST')
db_name = os.getenv('DB_NAME')

In [15]:
# Connect to the Stack Overflow database
conn = psycopg2.connect(
    host=db_host,
    database=db_name,
    user=db_user,
    password=db_password
)

In [16]:
# Define HEART metrics for Stack Overflow
heart_metrics = {
    'Happiness': 'Measure of user satisfaction and happiness with the Questions & Answers product.',
    'Engagement': 'Measure of user interaction and activity on the platform.',
    'Adoption': 'Measure of user acquisition and growth of Stack Overflow user base.',
    'Retention': 'Measure of user retention and continued usage of the platform over time.',
    'Task Success': 'Measure of user ability to accomplish their goals and tasks effectively on Stack Overflow.'
}

In [17]:
from sqlalchemy import create_engine


# With the env variables loaded we can insert them into the engine connection string.
engine = create_engine(f"postgresql+psycopg2://{env['DB_USER']}:{env['DB_PASSWORD']}@{env['DB_HOST']}/{env['DB_NAME']}")
connection = engine.connect()

### Test Connection

In [18]:
query = "SELECT * FROM users LIMIT 1"
df = pd.read_sql(query, connection)
df

Unnamed: 0,id,account_id,reputation,views,down_votes,up_votes,display_name,location,profile_image_url,website_url,about_me,creation_date,last_access_date
0,6650059,8909187,1,0,0,0,anngd,,,,,2016-07-28 13:52:12.933,2017-01-17 22:00:16.370


### Connect to Postgresql

In [10]:
%load_ext sql

In [11]:
%sql postgresql+psycopg2://{env['DB_USER']}:{env['DB_PASSWORD']}@{env['DB_HOST']}/{env['DB_NAME']}

---
---

# Retention Analysis
Measure of user retention and continued usage of the platform since 2009.

---

### User Retention Rate
User Retention = (Users at End - Users Onboarded) / Users at Start * 100

Users at end:

In [24]:
%%sql

SELECT COUNT(*)
FROM users
WHERE DATE_PART('year', last_access_date) >= 2023;

 * postgresql+psycopg2://analyst:***@terraform-20231114113407120500000001.cfmnnswnfhpn.eu-west-2.rds.amazonaws.com/stackoverflow
1 rows affected.


count
2604337


In [28]:
sql_query = """
SELECT COUNT(*)
FROM users
WHERE DATE_PART('year', last_access_date) >= 2023;
"""
users_at_end = pd.read_sql(sql_query, conn)

  users_at_end = pd.read_sql(sql_query, conn)


Users onboarded:

In [36]:
%%sql

SELECT COUNT(*)
FROM users
WHERE DATE_PART('year', creation_date) >= 2009;

 * postgresql+psycopg2://analyst:***@terraform-20231114113407120500000001.cfmnnswnfhpn.eu-west-2.rds.amazonaws.com/stackoverflow
1 rows affected.


count
19921152


In [37]:
sql_query_2 = """
SELECT COUNT(*)
FROM users
WHERE DATE_PART('year', creation_date) >= 2009;
"""
users_onboarded = pd.read_sql(sql_query_2, conn)

  users_onboarded = pd.read_sql(sql_query_2, conn)
