In [31]:
import requests,json,time
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import numpy
import math
import copy
import random
%run 'D:\dev\prod\allStuffHelper.py'

class DirectReport(object):
    """Get Direct Report"""

    def __init__(self, token, columns, campaignId, startDate, endDate):
        self._token = token
        self._url = 'https://api.direct.yandex.ru/live/v4/json/'
        self._columns = columns
        self._campaignId = campaignId
        self._startDate = startDate
        self._endDate = endDate

    def _createNewReportPayload(self):
        payload = {
        "method": "CreateNewReport",
        "token": self._token,
        "param": {
            "CampaignID": self._campaignId,
            "StartDate": self._startDate,
            "EndDate": self._endDate,
            "GroupByColumns": self._columns,
            "GroupByDate": "day",
            "TypeResultReport": "xml",
            "CompressReport": 0}
            }
        return payload

    def _createJsonUTF8Payload(self,payload):
        return json.dumps(payload, ensure_ascii=False).encode('utf8')

    def _createReport(self):
        payload = self._createJsonUTF8Payload(self._createNewReportPayload())
        return requests.post(self._url,payload).text

    def _getReportList(self):
        payload = self._createJsonUTF8Payload({
            "method": "GetReportList",
            "token": self._token
            })
        return requests.post(self._url,payload).text

    def _getReport(self,reportID):
        reportList = self._getReportList()
        reportItem = (item for item in json.loads(reportList)['data'] if item["ReportID"] == json.loads(reportID)['data']).next()
        if reportItem['StatusReport'] == 'Done':
            data = requests.get(reportItem['Url']).text
            return data
        else:
            return False

    def _deleteReport(self,reportID):
        payload = self._createJsonUTF8Payload({
            "method": "DeleteReport",
            "token": self._token,
            "param": reportID
            })
        return requests.post(self._url,payload).text

    def _getReportData(self):
        reportId = self._createReport()
        time.sleep(100)
        report = self._getReport(reportId)
        print self._deleteReport(json.loads(reportId)['data'])
        return report

class DirectReportProcessing(object):
    """Direct report data processing tasks: parse XML, create Pandas DataFrame"""

    def __init__(self, report):
        self._report = report
        self._data = BeautifulSoup(self._report,'lxml').report
        self._campaignId = self._data.campaignid.get_text()

    def _parseXML(self):
        length = int(self._data.stat.attrs['rows'])
        print 'Direct report length = {0}\t'.format(length)
        if length > 0:
            stats = []
            for tag in self._data.stat.find_all('row'):
                stats.append(tag.attrs)
            print 'Length check: {0}'.format(str(len(stats)==length))
        return stats

    def _createDataFrame(self,stats):
        df_stats = pd.DataFrame(stats)
        df_stats['campaignId'] = self._campaignId
        return self._castDataFrame(df_stats)

    def _castDataFrame(self,df):
        df['clicks'] = df['clicks'].astype(int)
        df['clicks_context'] = df['clicks_context'].astype(int)
        df['clicks_search'] = df['clicks_search'].astype(int)
        df['shows'] = df['shows'].astype(int)
        df['shows_context'] = df['shows_context'].astype(int)
        df['shows_search'] = df['shows_search'].astype(int)
        df['sum'] = df['sum'].astype(float)*30.0
        df['sum_context'] = df['sum_context'].astype(float)*30.0
        df['sum_search'] = df['sum_search'].astype(float)*30.0
        df['campaignId'] = df['campaignId'].astype(int)
        return df

def getMetrikaData (projectInfo,date1,date2):
    dimensions = 'ym:s:directID,ym:s:deviceCategory,ym:s:gender,ym:s:ageInterval'
    goalsMetrics = ','.join(['ym:s:goal'+str(target['targetid'])+'visits' for target in getCPATargetsByProjectID(projectInfo['projectid'])])
    otherMetrics = 'ym:s:visits,ym:s:bounceRate'
    metrics = ','.join([otherMetrics,goalsMetrics])
    filters = ""
    attribution = 'lastsign'
    metrikaData = getMetrikaReportInDataFrame(date1,date2,metrics,dimensions,projectInfo['token'],projectInfo['counterId'],filters,attribution)
    metrikaGoalsData = mergeCPAVisitsColumns(dropTotalRow(metrikaData))
    return metrikaGoalsData

def processMetrikaData (metrikaData):
    df = metrikaData
    df.rename(columns={u'﻿"Идентификатор кампании для директа"': 'campaignId',
                   u'Тип устройства':'device_type',
                   u'Пол': 'gender',
                   u'Возраст': 'age',
                  u'Визиты':'visits',
                  u'Отказы':'bounceRate'},
          inplace=True)
    df = df[~df[u'device_type'].isin([u'ТВ'])]
    df['bounces'] = df['visits']*df['bounceRate']
    df['conversion_rate'] = df['goal_visits']/df['visits']
    df['campaignId'] = df['campaignId'].str.replace('N-','').astype(int)
    df.replace(to_replace={'device_type':{u'ПК':'desktop',u'Смартфоны':'mobile',u'Планшеты':'tablet'},
                          'age':{u'младше 18 лет':'AGE_0_17',u'18‑24 года':'AGE_18_24',u'25‑34 года':'AGE_25_34',u'35‑44 года':'AGE_35_44',u'45 лет и старше':'AGE_45',u'Не определено':'AGE_UNKNOWN'},
                          'gender':{u'женский':'GENDER_FEMALE',u'мужской':'GENDER_MALE',u'Не определено':'GENDER_UNKNOWN'}},inplace=True)
    return df

def mergeDataFrames(metrikaData,directData):
    merged_df = pd.merge(metrikaData,directData,on=['age','device_type','gender','campaignId'],how='inner')
    merged_df['cpv'] = merged_df['sum']/merged_df['visits']
    print 'Check merged dataframe length:',len(metrikaData)==len(merged_df)
    return merged_df

def groupData(df):
    cleaned_df = df[~df['gender'].isin(['GENDER_UNKNOWN'])]
    cleaned_df = cleaned_df[~cleaned_df['age'].isin(['AGE_UNKNOWN'])]
    cleaned_df = cleaned_df[['campaignId','age','gender','sum','visits','goal_visits']].groupby(['campaignId','age','gender'],as_index=False).sum()
    return cleaned_df

class GenderAgeDataframe():
    """Calculate gender-age dataframe with bid adjustments"""

    def _calculateGenderAgeDataframe(self,df):
        if df['goal_visits'].sum() == 0:
            return pd.DataFrame()
        gender_age_dataframe = df
        gender_age_dataframe['conversed_cpa'] = gender_age_dataframe['goal_visits']/gender_age_dataframe['sum']
        gender_age_dataframe['conversed_cpa'] = gender_age_dataframe['conversed_cpa'].replace(np.inf, np.nan).fillna(0)
        gender_age_dataframe['conversion_rate'] = gender_age_dataframe['goal_visits']/gender_age_dataframe['visits']
        gender_age_dataframe['cpv'] = gender_age_dataframe['sum']/gender_age_dataframe['visits']
        gender_age_visits = gender_age_dataframe['visits'].sum()
        gender_age_sum = gender_age_dataframe['sum'].sum()
        gender_age_goal_visits = gender_age_dataframe['goal_visits'].sum()
        gender_age_conversion_rate = gender_age_goal_visits/gender_age_visits
        gender_age_conversed_cpa = gender_age_goal_visits/gender_age_sum
        gender_age_conversed_cpa_std = gender_age_dataframe['conversed_cpa'].std()
        gender_age_dataframe['sqrt_sum'] = gender_age_dataframe['sum'].apply(numpy.sqrt)
        gender_age_dataframe['z_criteria'] = (gender_age_dataframe['conversed_cpa'] - gender_age_conversed_cpa)/(gender_age_conversed_cpa_std/gender_age_dataframe['sqrt_sum'])
        gender_age_dataframe['z_criteria'] = gender_age_dataframe['z_criteria'].replace(np.inf, np.nan).fillna(0)
        return gender_age_dataframe

    def _calculateBidAdjsutemnts(self,gender_age_df,min_bid_adjustment,max_bid_adjustment):
        gender_age_df['min_bid_adj'] = min_bid_adjustment
        gender_age_df['max_bid_adj'] = max_bid_adjustment
        maximum_z_criteria = gender_age_df['z_criteria'].max()
        minimum_z_criteria = gender_age_df['z_criteria'].min()
        delta_z_criteria = maximum_z_criteria - minimum_z_criteria
        adjustment_step = (max_bid_adjustment-min_bid_adjustment)/delta_z_criteria
        gender_age_df['bid_adjustment'] = (gender_age_df['z_criteria'] + abs(minimum_z_criteria))*adjustment_step + min_bid_adjustment
        gender_age_df['new_visits'] = gender_age_df['visits']*gender_age_df['bid_adjustment']
        gender_age_df['new_cpv'] = gender_age_df['cpv']*gender_age_df['bid_adjustment']
        gender_age_df['new_sum'] = gender_age_df['new_visits']*gender_age_df['new_cpv']
        gender_age_df['new_goal_visits'] = gender_age_df['new_visits']*gender_age_df['conversion_rate']
        gender_age_df['new_conversed_cpa'] = gender_age_df['new_goal_visits']/gender_age_df['new_sum']
        gender_age_df['new_conversed_cpa'] = gender_age_df['new_conversed_cpa'].replace(np.inf, np.nan).fillna(0)
        return gender_age_df

    def _searchOptimalCalculation(self,df):
        MAX_BID_ADJ = 119
        MIN_BID_ADJ = 6
        MAX_DIFFERENCE = 0.15
        optimalCalculations = pd.DataFrame()
        for min_bid_adjsutment in xrange(MIN_BID_ADJ,10):
            for max_bid_adjsutment in xrange(11,MAX_BID_ADJ+1):
                min_bd = min_bid_adjsutment/10.0
                max_bd = max_bid_adjsutment/10.0
#                 print min_bid_adjsutment/10.0,max_bid_adjsutment/10.0
                calculation = self._calculateBidAdjsutemnts(df,min_bd,max_bd)
                old_sum = calculation['sum'].sum()
                new_sum = calculation['new_sum'].sum()
                old_visits = calculation['visits'].sum()
                new_visits = calculation['new_visits'].sum()
                if (abs(old_sum/new_sum-1) < MAX_DIFFERENCE) & (abs(old_visits/new_visits-1) < MAX_DIFFERENCE):
                    optimalCalculations = pd.concat([optimalCalculations,calculation])
        return optimalCalculations

    def _getBestCalculation(self,optimalCalculations):
        min_bid_adj = optimalCalculations[['min_bid_adj','max_bid_adj','new_goal_visits']].groupby(['min_bid_adj','max_bid_adj'],as_index=False).sum().sort_values('new_goal_visits',ascending=False).iloc[0]['min_bid_adj']
        max_bid_adj = optimalCalculations[['min_bid_adj','max_bid_adj','new_goal_visits']].groupby(['min_bid_adj','max_bid_adj'],as_index=False).sum().sort_values('new_goal_visits',ascending=False).iloc[0]['max_bid_adj']
        bestCalculation = optimalCalculations[(optimalCalculations['min_bid_adj'] == min_bid_adj) & (optimalCalculations['max_bid_adj'] == max_bid_adj)]
        return bestCalculation

In [35]:
project = u'the-alba.com'
VipProjectInfo = getExtendedProjectInfo(project,'Metrika')
print VipProjectInfo
date1 = '2016-02-01'
date2 = '2016-08-01'
metrika_df = getMetrikaData(VipProjectInfo,date1,date2)
metrika_df = processMetrikaData(metrika_df)

{'domain': 'the-alba.com', 'metrikapassword': '2015_theALBA', 'metrikalogin': 'thealbashoes', 'projectid': 2455189L, 'counterId': 17295088, 'token': u'AQAAAAAUBXcjAAFxmDgRz3t_K0c-mP3pXOSEih8'}
https://beta.api-metrika.yandex.ru/stat/v1/data.csv?dimensions=ym:s:directID,ym:s:deviceCategory,ym:s:gender,ym:s:ageInterval&metrics=ym:s:visits,ym:s:bounceRate,ym:s:goal22026910visits,ym:s:goal20745715visits,ym:s:goal20690475visits,ym:s:goal20690255visits&date1=2016-02-01&date2=2016-08-01&ids=17295088&oauth_token=AQAAAAAUBXcjAAFxmDgRz3t_K0c-mP3pXOSEih8&limit=100000&offset=1&filters=&accuracy=full&attribution=lastsign


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [36]:
metrika_df['campaignId'].unique()

array([19737886, 19504685, 19504691, 19827935, 19504759, 19504772,
       19504781, 20236209, 20610193], dtype=int64)

In [37]:
columns = ['clDemographics','clDeviceType']
directToken = GetToken('alexgrtula','Direct')
excluded_campaigns = [7217214,7259041,7233717,7217230]
campaigns_list = [int(x) for x in list(metrika_df['campaignId'].unique()) if x not in excluded_campaigns]
direct_stats = []
for campaignId in campaigns_list:
    print 'Create report: {0}'.format(campaignId)
    dr = DirectReport(directToken,columns,campaignId,date1,date2)
    reportData = dr._getReportData()
    drp = DirectReportProcessing(reportData)
    campaign_stats_xml = drp._parseXML()
    campaign_stats_df = drp._createDataFrame(campaign_stats_xml)
    direct_stats.append(campaign_stats_df)

Create report: 19737886
{"data":1}
Direct report length = 47	
Length check: True
Create report: 19504685
{"data":1}
Direct report length = 54	
Length check: True
Create report: 19504691
{"data":1}
Direct report length = 54	
Length check: True
Create report: 19827935
{"data":1}
Direct report length = 54	
Length check: True
Create report: 19504759
{"data":1}
Direct report length = 39	
Length check: True
Create report: 19504772
{"data":1}
Direct report length = 42	
Length check: True
Create report: 19504781
{"data":1}
Direct report length = 53	
Length check: True
Create report: 20236209
{"data":1}
Direct report length = 43	
Length check: True
Create report: 20610193


StopIteration: 

In [40]:
merged_df = []
bestCalcs = pd.DataFrame()
for direct_df in direct_stats:
    campaignId = direct_df['campaignId'][0]
    metrika_criteria_df = metrika_df[metrika_df['campaignId'] == campaignId]
    merged_df = groupData(mergeDataFrames(metrika_criteria_df,direct_df))
    gad = GenderAgeDataframe()
    genage_df = gad._calculateGenderAgeDataframe(merged_df)
    if (len(genage_df) > 0):
        if genage_df['conversed_cpa'].mean() != 0:
            optimalCalculations = gad._searchOptimalCalculation(genage_df)
            bestCalcs = pd.concat([bestCalcs,gad._getBestCalculation(optimalCalculations)])

Check merged dataframe length: False
Check merged dataframe length: True
Check merged dataframe length: True
Check merged dataframe length: True
Check merged dataframe length: True
Check merged dataframe length: True
Check merged dataframe length: True
Check merged dataframe length: True


In [42]:
bestCalcs.to_excel('D:\\_Docs_\\2016-07\\2016-07-26 Gender Age Adjustments\\the-alba.xlsx')

In [43]:
bestCalcs

Unnamed: 0,campaignId,age,gender,sum,visits,goal_visits,conversed_cpa,conversion_rate,cpv,sqrt_sum,z_criteria,min_bid_adj,max_bid_adj,bid_adjustment,new_visits,new_cpv,new_sum,new_goal_visits,new_conversed_cpa
0,19737886,AGE_0_17,GENDER_FEMALE,30.3,39,0.0,0.0,0.0,0.776923,5.504544,-2.877681,0.6,1.6,0.982586,38.320866,0.763394,29.253918,0.0,0.0
1,19737886,AGE_0_17,GENDER_MALE,1.8,5,0.0,0.0,0.0,0.36,1.341641,-0.701387,0.6,1.6,1.033301,5.166503,0.371988,1.921878,0.0,0.0
2,19737886,AGE_18_24,GENDER_FEMALE,88.8,130,0.0,0.0,0.0,0.683077,9.423375,-4.926378,0.6,1.6,0.934845,121.529913,0.638571,77.605523,0.0,0.0
3,19737886,AGE_18_24,GENDER_MALE,9.0,54,0.0,0.0,0.0,0.166667,3.0,-1.568348,0.6,1.6,1.013098,54.707275,0.16885,9.237302,0.0,0.0
4,19737886,AGE_25_34,GENDER_FEMALE,1673.7,549,9.0,0.005377,0.016393,3.048634,40.910879,4.448861,0.6,1.6,1.153317,633.170922,3.516041,2226.254706,10.379851,0.004662
5,19737886,AGE_25_34,GENDER_MALE,139.8,168,3.0,0.021459,0.017857,0.832143,11.823705,23.617338,0.6,1.6,1.6,268.8,1.331429,357.888,4.8,0.013412
6,19737886,AGE_35_44,GENDER_FEMALE,570.6,290,5.0,0.008763,0.017241,1.967586,23.887235,12.094992,0.6,1.6,1.331495,386.13345,2.619831,1011.604197,6.657473,0.006581
7,19737886,AGE_35_44,GENDER_MALE,50.4,52,1.0,0.019841,0.019231,0.969231,7.099296,12.831541,0.6,1.6,1.348658,70.13024,1.307161,91.671534,1.348658,0.014712
8,19737886,AGE_45,GENDER_FEMALE,1362.3,280,0.0,0.0,0.0,4.865357,36.909348,-19.295573,0.6,1.6,0.6,168.0,2.919214,490.428,0.0,0.0
9,19737886,AGE_45,GENDER_MALE,117.0,106,0.0,0.0,0.0,1.103774,10.816654,-5.654761,0.6,1.6,0.917872,97.29443,1.013123,98.571211,0.0,0.0
