In [2]:
import pandas as pd
import datetime as dt
from datetime import timedelta
from sqlalchemy import create_engine

In [3]:
# Load the transaction log and turn its 'timestamp' strings into datetimes
# so the column can be compared against datetime bounds.
transactions_csv = 'transactions-comp.csv'
data = pd.read_csv(transactions_csv)
parsed_stamps = pd.to_datetime(data['timestamp'])
data['timestamp'] = parsed_stamps

def get_trans_count(i, transactions=None, window_hours=6):
    """Count one-time transactions for a row's org/form in a trailing window.

    Parameters
    ----------
    i : mapping or pandas.Series
        Row supplying 'timestamp', 'org' and 'entity'. The 'entity' value is
        matched against the 'form' column of the transaction table.
    transactions : pandas.DataFrame, optional
        Table with 'timestamp', 'org', 'form' and 'recurring' columns.
        Defaults to the module-level ``data`` frame, which keeps
        ``df.apply(get_trans_count, axis=1)`` working unchanged.
    window_hours : int or float, optional
        Length of the look-back window that ends at ``i['timestamp']``.
        Both ends of the window are inclusive. Defaults to 6.

    Returns
    -------
    int
        Number of rows inside the window whose 'recurring' value is 0.
    """
    if transactions is None:
        transactions = data

    window_end = i['timestamp']
    window_start = window_end - timedelta(hours=window_hours)
    stamps = transactions['timestamp']

    # One combined boolean mask avoids materialising intermediate copies of
    # the transaction table for every row that is processed.
    matches = (
        (stamps <= window_end)
        & (stamps >= window_start)
        & (transactions['org'] == i['org'])
        & (transactions['form'] == i['entity'])
        & (transactions['recurring'] == 0)
    )
    return int(matches.sum())

def get_trans_vol(i, transactions=None, window_hours=6):
    """Sum 'donations_amt' of one-time transactions in a trailing window.

    Parameters
    ----------
    i : mapping or pandas.Series
        Row supplying 'timestamp', 'org' and 'entity'. The 'entity' value is
        matched against the 'form' column of the transaction table.
    transactions : pandas.DataFrame, optional
        Table with 'timestamp', 'org', 'form', 'recurring' and
        'donations_amt' columns. Defaults to the module-level ``data`` frame,
        which keeps ``df.apply(get_trans_vol, axis=1)`` working unchanged.
    window_hours : int or float, optional
        Length of the look-back window that ends at ``i['timestamp']``.
        Both ends of the window are inclusive. Defaults to 6.

    Returns
    -------
    number
        Sum of 'donations_amt' over rows inside the window whose 'recurring'
        value is 0 (0 when nothing matches).
    """
    if transactions is None:
        transactions = data

    window_end = i['timestamp']
    window_start = window_end - timedelta(hours=window_hours)
    stamps = transactions['timestamp']

    # One combined boolean mask avoids materialising intermediate copies of
    # the transaction table for every row that is processed.
    matches = (
        (stamps <= window_end)
        & (stamps >= window_start)
        & (transactions['org'] == i['org'])
        & (transactions['form'] == i['entity'])
        & (transactions['recurring'] == 0)
    )
    return transactions.loc[matches, 'donations_amt'].sum()

In [4]:
# SQLAlchemy engine for the SQLite file that receives the analytics tables.
# NOTE(review): the database name says 2015-2016 while the CSV loaded into
# the 'base' table is analytic_base_2013-2014.csv -- confirm this is intended.
analytics_db_url = 'sqlite:///analytics_2015-2016.db'
disk_engine = create_engine(analytics_db_url)

## Insert analytics into SQLite DB

In [5]:
# Stream the base analytics CSV in chunks, derive the one-time transaction
# features for every row, and append each chunk to the 'base' SQLite table.
# NOTE: if_exists='append' means re-running this cell inserts the rows again.
start = dt.datetime.now()
chunksize = 2000
j = 0             # number of chunks processed so far
rows_done = 0     # number of rows actually written so far
index_start = 1   # 1-based label for the first row of the next chunk

for df in pd.read_csv('analytic_base_2013-2014.csv', chunksize=chunksize, iterator=True, encoding='utf-8'):
    # remove possible spaces from column names
    df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})
    # convert time stamp string to datetime
    df['timestamp'] = pd.to_datetime(df['tm_stamp'])
    # drop() returns a new frame; without the reassignment the raw string
    # column was never removed and was written to the database as well.
    df = df.drop('tm_stamp', axis=1)
    df['one_time_trans_count'] = df.apply(get_trans_count, axis=1)
    df['one_time_trans_vol'] = df.apply(get_trans_vol, axis=1)

    # Assign an explicit contiguous 1-based index instead of offsetting the
    # chunk's own index: an offset is only correct if every chunk's index
    # restarts at 0, which read_csv does not guarantee.
    chunk_rows = len(df)
    df.index = range(index_start, index_start + chunk_rows)
    index_start += chunk_rows
    j += 1
    rows_done += chunk_rows

    df.to_sql('base', disk_engine, if_exists='append')

    # Report after the write so "completed" is accurate; use the real row
    # count (the last chunk may be short) and total_seconds(), because
    # timedelta.seconds wraps around after 24 hours.
    elapsed = int((dt.datetime.now() - start).total_seconds())
    print('{} seconds: completed {} rows'.format(elapsed, rows_done))

35 seconds: completed 2000 rows
71 seconds: completed 4000 rows
106 seconds: completed 6000 rows
141 seconds: completed 8000 rows
181 seconds: completed 10000 rows
216 seconds: completed 12000 rows
249 seconds: completed 14000 rows
283 seconds: completed 16000 rows
318 seconds: completed 18000 rows
353 seconds: completed 20000 rows
390 seconds: completed 22000 rows
424 seconds: completed 24000 rows
458 seconds: completed 26000 rows
490 seconds: completed 28000 rows
522 seconds: completed 30000 rows
559 seconds: completed 32000 rows
594 seconds: completed 34000 rows
626 seconds: completed 36000 rows
658 seconds: completed 38000 rows
690 seconds: completed 40000 rows
722 seconds: completed 42000 rows
755 seconds: completed 44000 rows
787 seconds: completed 46000 rows
820 seconds: completed 48000 rows
852 seconds: completed 50000 rows
884 seconds: completed 52000 rows
916 seconds: completed 54000 rows
950 seconds: completed 56000 rows
981 seconds: completed 58000 rows
1014 seconds: comple

In [10]:
# Read the whole base analytics CSV into memory in one call (no chunking).
base_csv = 'analytic_base_2013-2014.csv'
analytics = pd.read_csv(base_csv)

In [15]:
# Show the last five rows of the transaction table.
data.tail(n=5)

Unnamed: 0,id,org,form,recurring,recurring_creatingTransaction,creatingTransactionFor,timestamp,state,zip,amount,donations_count,donations_amt,events_count,events_tickets,events_amt,registrations_count,purchases_count,purchases_quantity,purchases_amt
916690,3310773,421,393,0,0,0,2016-06-24 15:07:53,Maryland,21117,100.0,1,100,0,0,0,0,0,0,0
916691,3310774,14268,820342,0,0,0,2016-06-24 15:08:58,Minnesota,55056,25.0,1,25,0,0,0,0,0,0,0
916692,3310775,32752,111684,0,0,0,2016-06-24 15:09:11,,71302,50.0,2,50,0,0,0,0,0,0,0
916693,3310776,1404,23591,0,0,0,2016-06-24 15:09:53,Florida,33803,180.0,1,180,0,0,0,0,0,0,0
916694,3310777,600,572,0,0,0,2016-06-24 15:12:14,Illinois,61244,51.25,1,50,0,0,0,0,0,0,0


## Insert Qgiv analytics data into the SQLite DB

In [34]:
# Initialise the SQLite connection (same database URL as the engine created
# earlier in the notebook).
qgiv_db_url = 'sqlite:///analytics_2015-2016.db'
disk_engine = create_engine(qgiv_db_url)

In [20]:
# Load the two qgiv CSV exports that are merged in the next step.
qgiv_csv = "analytic_qgiv_2015-2016.csv"
qgiv_stats_csv = "analytic_qgiv_stats_2015-2016.csv"
qgiv_analytics = pd.read_csv(qgiv_csv)
qgiv_stats_analytics = pd.read_csv(qgiv_stats_csv)

In [24]:
# Left join: keep every row of qgiv_stats_analytics and attach the
# qgiv_analytics rows whose 'stats' value equals that row's 'id'.
qgiv_stats = qgiv_stats_analytics.merge(
    qgiv_analytics,
    how="left",
    left_on="id",
    right_on="stats",
)

In [None]:
# Append the merged frame to the 'qgiv_stats' table; because of
# if_exists='append', running this cell again inserts the rows again.
qgiv_stats.to_sql(name='qgiv_stats', con=disk_engine, if_exists='append')

## Tests

In [None]:
# Disabled scratch cell: reloads the 2015-2016 base CSV and replaces the
# 'tm_stamp' string column with a parsed 'timestamp' column.
#analytics = pd.read_csv('analytic_base_2015-2016.csv')
#analytics['timestamp'] = pd.to_datetime(analytics['tm_stamp'])
#analytics = analytics.drop('tm_stamp', axis=1)

In [None]:
# Disabled scratch cell (uses Python 2 print statements).
# NOTE(review): DataFrame.sort was removed from later pandas releases; use
# sort_values if this cell is revived.
# NOTE(review): .head() is applied at assignment, so sorted_analytics holds at
# most five rows and the later head()/tail() calls show the same rows.
#sorted_analytics = analytics.sort('one_time_trans_count').head()
#print len(sorted_analytics)
#print sorted_analytics.head()
#print sorted_analytics.tail()