In [3]:
import os
import pygal
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from scipy.stats import chisquare
from datetime import datetime, timedelta
%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.2f' % x)

ModuleNotFoundError: No module named 'pygal'

In [32]:
def plot_barstack(data):
    values = data.value_counts(True).sort_index().reset_index().values
    barstack_chart = pygal.StackedBar()
    barstack_chart.title = 'Categorical variable distribution'
    for category, freq in values:
        barstack_chart.add(category, freq)
    return barstack_chart

def plot_spc(df, date_col, cont_col, freq="month", n_sigmas = 3):
    aux = df[[date_col, cont_col]].copy()
    if freq == "month":
        date_format = "%Y-%m-01"
    elif freq == "day":
        date_format = "%Y-%m-%d"
    elif freq == "year":
        date_format = "%Y-01-01"
    aux[freq] = aux[date_col].dt.strftime(date_format)
    aux = aux[[freq, cont_col]].groupby(freq).mean()
    aux[u"μ"] = aux[cont_col].mean()
    aux[f"μ-{n_sigmas}σ"] = aux[cont_col].mean() - n_sigmas*aux[cont_col].std()
    aux[f"μ+{n_sigmas}σ"] = aux[cont_col].mean() + n_sigmas*aux[cont_col].std()
    line_chart = pygal.Line()
    line_chart.title = 'Statistical Control Process'
    line_chart.x_labels = aux.index
    for col in aux.columns:
        line_chart.add(col, aux[col])
    return line_chart, aux[u"μ"].min(), aux[f"μ-{n_sigmas}σ"].min(), aux[f"μ+{n_sigmas}σ"].min()

def plot_histogram(data, n_bins = 10):
    freq, bins = np.histogram(data, bins=n_bins)
    hist = pygal.Histogram()
    hist.add('Wide bars', list(zip(freq, bins, bins[1:])))
    return hist

In [3]:
train = pd.read_csv("/home/oscar/Desktop/Examen 1/FB_data_train.csv")
test = pd.read_csv("/home/oscar/Desktop/Examen 1/FB_data_test.csv")

In [4]:
train["Fecha"] = pd.to_datetime(train["Fecha"])
test["Fecha"] = pd.to_datetime(test["Fecha"])

In [5]:
ls_cont = ["Cierre", "Apertura", "Máximo", "Mínimo"]
ls_disc = ["Quarter", "Weekday"]

In [6]:
def feature_building(df):
    df["Rango"] = df["Máximo"] - df["Mínimo"]
    df["Intervalo"] = df["Cierre"] - df["Apertura"]
    df["RatioCA"] = df["Cierre"] / df["Apertura"]
    return df

In [7]:
train = feature_building(train)

In [8]:
test = feature_building(test)

In [9]:
def build_temporal_features(df):
    df["dia"] = df["Fecha"].dt.strftime("%Y-%m-%d")
    df["mes"] = df["Fecha"].dt.strftime("%Y-%m-01")
    df["anio"] = df["Fecha"].dt.strftime("%Y-01-01")
    return df

In [10]:
train = build_temporal_features(train)

In [11]:
test = build_temporal_features(test)

In [12]:
ls_disc += ["dia", "mes", "anio"]

In [39]:
for feat in ls_disc:
    train[feat] = train[feat].astype(str)
    test[feat] = test[feat].astype(str)

In [13]:
def generate_agg(df):
    agg1 = df[["dia", "Cierre"]].groupby("dia").agg(["min", "max", "mean"]).reset_index()
    agg2 = df[["mes", "Apertura"]].groupby("mes").agg(["min", "max", "mean"]).reset_index()
    return agg1, agg2

In [14]:
agg1, agg2 = generate_agg(train)
agg1_test, agg2_test = generate_agg(test)

In [15]:
cs = pd.DataFrame(columns=["p-value", "¿Son iguales?"])
for x in ls_cont:
    cs.loc[x, "p-value"] = ks_2samp(train[x], test[x]).pvalue
cs["¿Son iguales?"] = cs["p-value"].map(lambda x: "No" if x<0.05 else "Si")

In [16]:
cs

Unnamed: 0,p-value,¿Son iguales?
Cierre,0.0,No
Apertura,0.0,No
Máximo,0.0,No
Mínimo,0.0,No


In [17]:
cs["sets"] = "train/test"

In [20]:
ds = pd.DataFrame(columns=["p-value", "¿Son iguales?"])
for x in ls_disc:
    try:
        ds.loc[x, "p-value"] = chisquare(f_obs=test[x].value_counts(True).sort_index().values, f_exp=train[x].value_counts(True).sort_index().values).pvalue
    except:
        print(f"Error en la prueba de la variable {x}")
        continue
ds["¿Son iguales?"] = ds["p-value"].map(lambda x: "Si" if 1-x<0.05 else "No")

Error en la prueba de la variable dia
Error en la prueba de la variable mes
Error en la prueba de la variable anio


In [19]:
ds

Unnamed: 0,p-value,¿Son iguales?
Quarter,1.0,
Weekday,1.0,


In [27]:
path_to_save = "/home/oscar/Documents/Diplomado Ciencia de Datos/Módulo 1/Examen 1/Oscar Acosta"

In [58]:
for feat in ls_cont:
    plot_histogram(train[feat]).render_to_png(os.path.join(path_to_save, f"continuas/histogramas/{feat}_histograma_train.png"))
    plot_histogram(test[feat]).render_to_png(os.path.join(path_to_save, f"continuas/histogramas/{feat}_histograma_test.png"))
    plot, mean, mmstd, mpstd = plot_spc(cont_col=feat, date_col="Fecha", df=train, freq="month", n_sigmas=1)
    plot.render_to_png(os.path.join(path_to_save, f"continuas/SPC/{feat}_spc_train.png"))
    agg = test[[feat, "mes"]].groupby("mes").mean()
    if sum(agg[feat] > mpstd) > 1:
        print(f"Datos anómalos por encima del límite definido en la variable {feat}")
    if sum(agg[feat] < mmstd) > 1:
        print(f"Datos anómalos por debajo del límite definido en la variable {feat}")

Datos anómalos por encima del límite definido en la variable Cierre
Datos anómalos por encima del límite definido en la variable Apertura
Datos anómalos por encima del límite definido en la variable Máximo
Datos anómalos por encima del límite definido en la variable Mínimo


In [40]:
for feat in ls_disc:
    plot_barstack(train[feat]).render_to_png(os.path.join(path_to_save, f"discretas/{feat}_barstack_train.png"))
    plot_barstack(test[feat]).render_to_png(os.path.join(path_to_save, f"discretas/{feat}_barstack_test.png"))