# Script to access SQL and start to play with data

This code accesses my company's data, stored in their space on an Amazon Redshift cluster, so that I can analyze it and look for patterns that predict both retention (who sticks around) and virality (who is more likely to get other people to use the program).

In [None]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd

import re # to access database details in a file
import matplotlib.pyplot as plt
from datetime import datetime # to manipulate dates

% matplotlib inline

In [None]:
# Project root folder: the database-details files are read from here and the
# generated figure is saved here.
path = '/Users/brianna/Dropbox/Insight/WorkLife/WL_project/'

### Make a function to access each database

To keep the access details secure, keep them in a separate file that will remain private, even when the code is shared.

In [None]:
def connect_db(which_db):
    """Open a psycopg2 connection and cursor for one of the project databases.

    The access details are read from the private text file
    ``path + which_db + 'DBdeets.txt'``.  That file must contain the entries
    ``dbname=``, ``username=``, ``hostname=``, ``portname=`` and ``pw=``,
    each followed by a value with no whitespace in it.  When an entry
    appears more than once, the first occurrence is used.

    Args:
        which_db: Prefix of the details file to read (e.g. 'evnt', 'trns').

    Returns:
        A ``(con, cur)`` tuple.  Both items are ``None`` when the details
        file is missing an entry or the connection cannot be established,
        so callers can test the result instead of hitting an unbound
        ``cur``.

    Raises:
        OSError: If the details file cannot be opened or read.
    """
    # Read the details and release the file handle right away, even if the
    # read itself fails.
    with open(path + which_db + 'DBdeets.txt', 'r') as db_file:
        db_deets = db_file.read()

    keys = ('dbname', 'username', 'hostname', 'portname', 'pw')
    # Raw-string pattern: '\S' in a normal string is an invalid escape.
    found = {key: re.findall(key + r'=(\S+)', db_deets) for key in keys}
    missing = [key for key in keys if not found[key]]
    if missing:
        print('Unable to connect to '+which_db+' database :(')
        print('Missing entries in the details file: ' + ', '.join(missing))
        return None, None

    # Connect to the database.  (If you can't, give a polite notice and hand
    # back (None, None) rather than crashing on an undefined cursor.)
    con = None
    cur = None
    try:
        con = psycopg2.connect(database=found['dbname'][0],
                               user=found['username'][0],
                               host=found['hostname'][0],
                               port=found['portname'][0],
                               password=found['pw'][0])
        # Set up a cursor.  (Anytime you get an error with the cursor, you
        # have to reset the connection with it)
        cur = con.cursor()
        print('I connected to the '+which_db+' database!!')
    except psycopg2.Error as err:
        print('Unable to connect to '+which_db+' database :(')
        print('Reason: ' + str(err).strip())
        # Don't hand back a half-open connection without a cursor.
        if con is not None:
            con.close()
        con = None
        cur = None

    return con, cur

### Open the two databases where data are stored.

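The 'events' database has a main table called 'events131567', which has a bunch of sub-tables with information on the events completed by the user. (Note: the query below reads from the `app131567` schema, e.g. `app131567.viewed_meeting_details_page`, so the name given here may need checking.)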

The 'transactions' database has a bunch of tables with data about the user:
pg_stat_statements,
activities,
agenda_templates,
calendar_webhooks,
calendars,
contacts,
integration_authorizations,
integration_providers,
integrations,
meeting_seeds,
migrations,
notifications,
password_reset_tokens,
meetings,
short_urls,
team_invitations,
team_memberships,
teams,
users,
events,
items,
recurring_event_rules,
meeting_invitations


In [None]:
# Open one connection/cursor pair per database.  The argument is the prefix
# of the matching '<prefix>DBdeets.txt' details file.
conE, curE = connect_db(which_db='evnt')  # 'events' database
conT, curT = connect_db(which_db='trns')  # 'transactions' database


In [None]:
# Weekly unique viewers of the meeting details page (event data).
#
# The query returns two columns: 'week' (event_time truncated to the week
# and cast to a date) and 'users' (the number of distinct user_ids seen that
# week), read from the table of meeting-details page views and sorted by
# week in ascending order.

sql_query = """
select 
    date_trunc('week', event_time)::date as week,
    count(distinct user_id) as users
from app131567.viewed_meeting_details_page
group by date_trunc('week', event_time)::date
order by week asc
"""

# Run the query over the 'events' connection.
query_result = pd.read_sql_query(sql=sql_query, con=conE)

# All rows of the result, under the name the plotting cell uses.
df = query_result.iloc[:]

In [None]:
# Preview the first few rows of the weekly-users table.
df.head()

In [None]:
# Weeks strictly after this date are drawn as a separate curve.
date_cutoff = datetime.strptime('2015-08-02', '%Y-%m-%d').date()

xlabel = 'Week'
ylabel = 'Number of Users logged in (at least once)'

plt.figure(figsize=(20, 10))
plt.xlabel(xlabel, fontsize=25)
plt.ylabel(ylabel, fontsize=25)

# NOTE: df holds only the 'week' and 'users' columns, so a users-vs-events
# scatter plot would need a query that also returns an 'events' column.

# First curve: only the weeks after the cutoff.  Second curve: every week.
_after_cutoff = df.week > date_cutoff
plt.plot(df.week[_after_cutoff], df.users[_after_cutoff])
plt.plot(df.week, df.users)

# The file name is built from the two axis labels.
plt.savefig(path + xlabel + 'BY' + ylabel + '.png')


In [None]:
# Calendar events from the trailing 7 days for one specific user id.
#
# Columns available on `calendars`:
#   id, type, source_id, active, raw, created_at, updated_at, sync, legacy,
#   last_event_sync_at, last_calendar_sync_at, deleted_at, user_id,
#   needs_sync, integration_id, title
#
# Columns available on `events`:
#   id, type, source_id, calendar_id, raw, created_at, updated_at,
#   time_range, legacy, ignored, title, location, source_url, attendees,
#   global_id, recurring_event_id
#
# Only the event columns (e.*) are selected here.  A wider variant of the
# same query puts c.user_id, c.source_id, c.type, c.active and c.title in
# front of e.* to bring the calendar details along.

sql_query = """
select e.*
from calendars c, events e
where c.id = e.calendar_id
and c.user_id = '54d14f9453832673fdd7c29c'
and lower(time_range) between (current_date - INTERVAL '7 days') and current_date
"""

# Run the query over the 'transactions' connection.
query_result = pd.read_sql_query(sql=sql_query, con=conT)

In [None]:
# Show the row at position 1 (i.e. the second row) of the query result.
# NOTE(review): needs at least two rows in the result; confirm that position
# 1, rather than 0, is the row intended here.
query_result.iloc[1]

In [None]:
# Show the 'source_id' column for every row of the query result.
query_result['source_id']