# Limpieza y creación del dataframe

In [18]:
import numpy as np
import pandas as pd
import sqlite3
import os

In [45]:
np.random.seed(42)

In [3]:
connection = sqlite3.connect("data/BDTerra.db")

In [4]:
crsr = connection.cursor()

In [5]:
def sql_query(query):    # Ejecuta la query
    crsr.execute(query)    # Almacena los datos de la query
    ans = crsr.fetchall()    # Obtenemos los nombres de las columnas de la tabla
    names = [description[0] for description in crsr.description]    
    return pd.DataFrame(ans,columns=names)

In [None]:
query1 = """
SELECT Request
FROM ISSUES
"""

In [46]:
query2 = """
SELECT * 
FROM Projects
"""

In [10]:
df = sql_query(query1)

In [47]:
df_project = sql_query(query2)

In [48]:
df_project

Unnamed: 0,Project ID,Name,Project Type,Client ID,Budget,Project Manager ID,Beggining date,Contact ID
0,PC101,RipplePC101,Strategy,C1,1572929.92,E4,2017-07-06,COC101
1,PC102,RipplePC102,Strategy,C1,1410643.5,E5,2020-09-28,COC101
2,PC103,RipplePC103,Website,C1,1373192.78,E5,2017-11-08,COC101
3,PC104,RipplePC104,Campaign,C1,1285807.79,E4,2020-03-13,COC101
4,PC105,RipplePC105,Website,C1,1473061.35,E5,2022-06-16,COC101
5,PC106,RipplePC106,Website,C1,1585413.49,E5,2021-04-13,COC101
6,PC107,RipplePC107,Website,C1,1160972.08,E5,2015-08-20,COC101
7,PC201,RipplePC201,Strategy,C2,1368375.74,E4,2016-03-25,COC201
8,PC202,RipplePC202,Website,C2,1275288.95,E4,2018-01-18,COC201
9,PC203,RipplePC203,Strategy,C2,605064.1,E4,2016-09-14,COC201


In [13]:
df.columns

Index(['Issue ID', 'Project ID', 'Classification', 'Screenshot', 'Urgency',
       'Input Date', 'Deadline Theor', 'Deadline Real', 'Employee ID',
       'Device ', 'Browser', 'Page', 'Contact ID', 'Request'],
      dtype='object')

In [15]:
df.sort_values(by = "Issue ID", inplace=True)

In [28]:
df["Classification"].unique()

array(['Design issues', 'Copy issues', 'New item', 'Request change',
       'Not addressing', 'Bug fix'], dtype=object)

In [32]:
iteraciones_medias = {
    "Copy issues": (4,1),
    "Request change": (8,2),
    "Design issues": (4,2),
    "New item": (10,4),
    "Bug fix": (8,3),
    "Not addressing": (1,0.2)
}

In [33]:
def genera_iteraciones(clase):
    for i in df["Classification"].unique():
        if clase == i:
            med = iteraciones_medias[i][0]
            var = iteraciones_medias[i][1]
            return np.random.normal(med,var)

In [34]:
it = []
for i in df["Classification"]:
    it.append(genera_iteraciones(i))

In [41]:
it = np.trunc(it).astype("uint8")

In [42]:
df["Iteraciones"] = it

In [44]:
df["Iteraciones"].value_counts(normalize=True)

Iteraciones
4      0.220351
3      0.211404
5      0.128421
2      0.093158
6      0.072456
1      0.057719
7      0.048421
8      0.042105
0      0.038070
9      0.033509
10     0.022281
11     0.009474
12     0.009298
13     0.004561
255    0.001754
14     0.001754
17     0.001404
15     0.001228
16     0.000877
254    0.000877
18     0.000351
21     0.000175
20     0.000175
19     0.000175
Name: proportion, dtype: float64

In [53]:
def inicio_proyecto(proj_id):
    return pd.to_datetime(df_project[df_project["Project ID"] == proj_id]["Beggining date"].values[0])


In [70]:
def timelapse(issue_id):
    date = df[df["Issue ID"] == issue_id]["Input Date"].values[0]
    # print(date)
    proj_id = df[df["Issue ID"] == issue_id]["Project ID"].values[0]
    return max(pd.to_datetime(date) - pd.DateOffset(days=30), inicio_proyecto(proj_id))

In [62]:
df["Input Date"] = pd.to_datetime(df["Input Date"])

In [69]:
df.head()

Unnamed: 0,Issue ID,Project ID,Classification,Screenshot,Urgency,Input Date,Deadline Theor,Deadline Real,Employee ID,Device,Browser,Page,Contact ID,Request,Iteraciones
3855,IPC100101,PC1001,Design issues,0,,2015-09-05,,2015-09-06,E2,Mobile,Mozilla,True,COC1001,The spacing around section titles [645] should...,5
622,IPC1001010,PC1001,Copy issues,0,,2015-02-06,,2015-02-08,E1,Desktop,Safari,True,COC1001,Please rewrite for clarity the CTA button labe...,5
4500,IPC10010100,PC1001,New item,0,,2014-09-14,,2014-09-16,E3,Desktop,Chrome,True,COC1001,"Please add a FAQs segmented by user type, to m...",4
4555,IPC1001011,PC1001,New item,0,,2015-06-14,,2015-06-17,E3,Desktop,Chrome,True,COC1001,Please add a customizable data visualization b...,5
620,IPC1001012,PC1001,Copy issues,1,,2015-05-29,,2015-06-01,E1,Desktop,Safari,True,COC1001,Please emphasize benefits over features the Ca...,3


In [82]:
def suma_it(issue_id):
    fecha_30 = timelapse(issue_id)
    # print(fecha_30)
    input = df[df["Issue ID"] == issue_id]["Input Date"].values[0]
    # print(input)
    proj_id = df[df["Issue ID"] == issue_id]["Project ID"].values[0]
    aux = df[(df["Project ID"] == proj_id) & (df["Input Date"].between(fecha_30,input))]
    # print(aux)
    return int(aux["Iteraciones"].sum())


In [86]:
sumas = []
for i in df["Issue ID"]:
    sumas.append(suma_it(i))
sumas

[34,
 53,
 7,
 37,
 59,
 55,
 25,
 28,
 40,
 21,
 32,
 30,
 45,
 38,
 60,
 27,
 63,
 68,
 13,
 60,
 29,
 51,
 47,
 71,
 11,
 31,
 38,
 36,
 48,
 62,
 47,
 48,
 56,
 35,
 22,
 51,
 55,
 59,
 28,
 64,
 37,
 22,
 26,
 59,
 47,
 50,
 7,
 32,
 19,
 50,
 54,
 69,
 15,
 22,
 67,
 44,
 8,
 29,
 35,
 64,
 40,
 58,
 68,
 14,
 28,
 35,
 13,
 53,
 34,
 29,
 67,
 37,
 43,
 40,
 30,
 47,
 52,
 46,
 50,
 30,
 56,
 50,
 46,
 68,
 47,
 56,
 40,
 27,
 11,
 35,
 24,
 62,
 40,
 31,
 29,
 42,
 35,
 62,
 28,
 37,
 32,
 27,
 30,
 38,
 34,
 31,
 38,
 85,
 30,
 55,
 43,
 34,
 41,
 32,
 33,
 54,
 27,
 37,
 54,
 64,
 54,
 47,
 30,
 60,
 30,
 28,
 9,
 25,
 31,
 84,
 54,
 33,
 57,
 37,
 47,
 38,
 50,
 47,
 57,
 31,
 50,
 60,
 34,
 35,
 48,
 47,
 45,
 59,
 23,
 30,
 49,
 28,
 60,
 43,
 41,
 55,
 60,
 49,
 43,
 85,
 34,
 27,
 26,
 43,
 25,
 46,
 35,
 33,
 33,
 38,
 22,
 32,
 29,
 24,
 32,
 36,
 84,
 35,
 42,
 44,
 30,
 35,
 30,
 59,
 48,
 46,
 36,
 4,
 43,
 55,
 57,
 28,
 27,
 30,
 53,
 39,
 55,
 27,
 84,
 17,
 44,


In [89]:
df["Iteraciones 30 dias"] = sumas

In [93]:
df["Iteraciones 30 dias"].sort_values().unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  97,  98,  99, 101, 103, 104, 264, 266, 270,
       272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
       285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 298,
       299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 310, 311, 313,
       314, 315, 318, 320, 321, 327])

In [97]:
df["Iteraciones 30 dias"].idxmax()

np.int64(3499)

In [None]:
df

Unnamed: 0,Issue ID,Project ID,Classification,Screenshot,Urgency,Input Date,Deadline Theor,Deadline Real,Employee ID,Device,Browser,Page,Contact ID,Request,Iteraciones,Iteraciones 30 dias
3855,IPC100101,PC1001,Design issues,0,,2015-09-05,,2015-09-06,E2,Mobile,Mozilla,True,COC1001,The spacing around section titles [645] should...,5,34
622,IPC1001010,PC1001,Copy issues,0,,2015-02-06,,2015-02-08,E1,Desktop,Safari,True,COC1001,Please rewrite for clarity the CTA button labe...,5,53
4500,IPC10010100,PC1001,New item,0,,2014-09-14,,2014-09-16,E3,Desktop,Chrome,True,COC1001,"Please add a FAQs segmented by user type, to m...",4,7
4555,IPC1001011,PC1001,New item,0,,2015-06-14,,2015-06-17,E3,Desktop,Chrome,True,COC1001,Please add a customizable data visualization b...,5,37
620,IPC1001012,PC1001,Copy issues,1,,2015-05-29,,2015-06-01,E1,Desktop,Safari,True,COC1001,Please emphasize benefits over features the Ca...,3,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,IPC909095,PC909,Bug fix,0,,2014-01-05,,2014-01-08,E3,Mobile,Safari,True,COC901,The search field behavior displays wrong font ...,7,35
4138,IPC909096,PC909,Design issues,0,,2014-01-01,,2014-01-02,E2,Mobile,Chrome,True,COC901,The contrast levels in the hero section [344] ...,5,23
834,IPC909097,PC909,Copy issues,0,,2013-04-22,,2013-04-25,E1,Mobile,Safari,True,COC901,Please remove buzzwords from the ‘Contact Us’ ...,3,76
4112,IPC909098,PC909,Design issues,0,,2013-04-15,,2013-04-16,E2,Desktop,Chrome,True,COC901,The color palette [3] feels too heavy visually.,3,70


In [98]:
df.to_csv("ISSUES_Iteraciones.csv", index = False)