In [None]:
import os
import findspark
# Initialise findspark before pyspark is imported in the following cell.
findspark.init()
from dotenv import load_dotenv
# Load variables from a .env file into the process environment; presumably
# this is where "dataPath" (read later via os.getenv) is defined — TODO confirm.
load_dotenv()


In [None]:
from pyspark.sql import SparkSession
# NOTE(review): the two wildcard imports below are what bring `col`, `count`,
# `when`, `isnan` and `IntegerType` (all used later in this notebook) into the
# global namespace. Explicit imports would make the origin of those names
# visible and harder to shadow by accident.
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pyspark.pandas as ps
import pandas as pd
#import mlflow

from warnings import filterwarnings
# Suppress every Python warning for the remainder of the session.
filterwarnings("ignore")

In [None]:
# Obtain the Spark session for this notebook, creating it if none exists yet.
spark = (
    SparkSession.builder
    .appName("Analytic Contracts App")
    .getOrCreate()
)

In [None]:
# Display the session object as the cell output.
spark

In [None]:
#mlflow.set_experiment("test-experiment")
#mlflow.autolog()

In [None]:
# Read the CSV dataset (first line is the header, column types are inferred).
# The file location comes from the "dataPath" environment variable; fail fast
# with a clear message when it is missing instead of handing None to Spark.
data_path = os.getenv("dataPath")
if not data_path:
    raise EnvironmentError(
        'Environment variable "dataPath" is not set; define it (e.g. in .env) '
        "with the location of the CSV file to analyse."
    )
df_spark = spark.read.options(header='True', inferSchema='True').csv(data_path)

In [None]:
# Peek at the first five rows.
df_spark.head(n=5)

In [None]:
# Peek at the last five rows.
df_spark.tail(num=5)

In [None]:
# Summary statistics for the dataset, printed as a table.
summary_stats = df_spark.describe()
summary_stats.show()

In [None]:
# Print the schema Spark inferred when reading the CSV.
df_spark.printSchema()

In [None]:
# Column names together with their data types.
df_spark.dtypes

In [None]:
# Report how many columns the dataset has.
column_total = len(df_spark.columns)
print(f"Total columns are: {column_total} columns")

In [None]:
# Count missing values (NULL, plus NaN where applicable) for every column.
#
# `isnan` is only defined for float/double input. Applying it to every column
# raises an AnalysisException as soon as the inferred schema contains e.g. a
# timestamp, date or boolean column, so the NaN test is restricted to
# floating-point columns; all other columns are checked for NULL only.
# Columns are referenced as `df_spark[name]` so the cell does not depend on
# the wildcard-imported `col` name, which is easy to shadow in a notebook.
nan_capable_columns = {
    name for name, dtype in df_spark.dtypes if dtype in ("float", "double")
}
missing_value_counts = [
    count(
        when(
            (isnan(df_spark[name]) | df_spark[name].isNull())
            if name in nan_capable_columns
            else df_spark[name].isNull(),
            name,
        )
    ).alias(name)
    for name in df_spark.columns
]
df_spark.select(missing_value_counts).show()

In [None]:

# Number of distinct procedure names. Counting inside Spark avoids pulling
# every distinct row to the driver just to take its length.
print("unique values:", df_spark.select("Nombre del Procedimiento").distinct().count())

In [None]:
# Show the distinct procedure names.
procedure_names = df_spark.select("Nombre del Procedimiento").distinct()
procedure_names.show()

In [None]:
# Print the most frequent value (mode) of every column.
#
# The loop variable must not be called `col`: that would rebind the `col`
# function imported from pyspark.sql.functions to a plain string and break any
# cell that calls `col(...)` when it is (re-)run afterwards.
for column_name in df_spark.columns:
    most_frequent_row = (
        df_spark.groupby(column_name)
        .count()
        .orderBy("count", ascending=False)
        .first()
    )
    # `first()` returns None when the DataFrame has no rows.
    mode_colum = most_frequent_row[0] if most_frequent_row is not None else None
    print(f"The mode for column {column_name} is: {mode_colum}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Extract the base price as a whole number.
# A 64-bit LongType is used instead of the 32-bit IntegerType: amounts above
# 2,147,483,647 do not fit in an int and would be corrupted or nulled by the
# cast. Values that fit in an int are unaffected.
base_price = df_spark.select(df_spark["Precio Base"].cast(LongType()).alias('base_price'))
base_price.show(5)

In [None]:
# Histogram of base prices.
# `collect()` returns Row objects, so unwrap them into plain numbers and drop
# nulls (missing prices or failed casts); matplotlib cannot bin None values.
base_price_values = [row[0] for row in base_price.collect() if row[0] is not None]

figure = plt.figure(figsize=(10, 6))
plt.hist(base_price_values, bins=20, edgecolor="black")
plt.title("Distribution of Base Prices")
plt.xlabel("Base Price")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

In [None]:
# plot distribution for entity
import numpy as np

# `plt.bar` needs explicit bar heights and has no `bins` argument, so the
# frequency of each entity is aggregated in Spark first. Only the 20 most
# frequent entities are plotted to keep the chart readable.
top_entities = (
    df_spark.groupBy("Entidad")
    .count()
    .orderBy("count", ascending=False)
    .limit(20)
    .collect()
)
entity_labels = [str(row["Entidad"]) for row in top_entities]
entity_frequencies = [row["count"] for row in top_entities]

figure2 = plt.figure(figsize=(8, 6))
plt.bar(entity_labels, entity_frequencies, edgecolor='black')
plt.title('Entity Distribution')
plt.xlabel('Entity')
plt.ylabel('Frequency')
# Entity names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Expose the Spark DataFrame through the pandas-on-Spark API for the cells below.
ps_df = df_spark.pandas_api()

In [None]:
# First five rows through the pandas-on-Spark API.
ps_df.head(n=5)

In [None]:
# Last five rows (the default for tail) through the pandas-on-Spark API.
ps_df.tail(n=5)

In [None]:
# Line plot of the base price column.
base_price_series = ps_df["Precio Base"]
base_price_series.plot.line()

In [None]:
# Bar plot of the duration column after casting it to 16-bit integers.
duration_int16 = ps_df["Duracion"].astype("int16")
duration_int16.plot.bar()

In [None]:
# Mean of "Proveedores Invitados" for each "Ciudad Entidad".
invited_by_city = ps_df.groupby(["Ciudad Entidad"])["Proveedores Invitados"]
invited_by_city.mean()


In [None]:
# Count of "Estado del Procedimiento" entries per "Ciudad Entidad", largest first.
status_by_city = ps_df.groupby(["Ciudad Entidad"])["Estado del Procedimiento"]
status_counts = status_by_city.count()
status_counts.sort_values(ascending=False)