## Pandas
https://www.geeksforgeeks.org/pandas/pandas-tutorial/
https://www.w3schools.com/python/pandas/default.asp

Pandas is a Python library used for working with data sets.

It has functions for analyzing, cleaning, exploring, and manipulating data.

Se necesita un DataFrame: *A DataFrame is a two-dimensional data structure made of rows and columns, similar to an Excel spreadsheet.*

### Series

In [4]:
import pandas as pd

# Plain Python list that supplies the values of the Series.
serie = [1, 7, 2]

# Give every value a label (index) instead of the default positions.
serie_labeled = pd.Series(data=serie, index=list("xyz"))
print(serie_labeled)

# Read a single value back through its label.
print(serie_labeled["x"])



# A dictionary converts directly into a Series: its keys become the labels.
calories = dict(day1=420, day2=380, day3=390)
calories_serie = pd.Series(data=calories)
print(calories_serie)



ModuleNotFoundError: No module named 'pandas'

### Data Frames

In [None]:
# Every dictionary key becomes a column; every list holds that column's values.
cars_dataset = {
    "cars": ["BMW", "Volvo", "Ford"],
    "passings": [3, 7, 2],
}
cars_dataframe = pd.DataFrame(data=cars_dataset)
print(cars_dataframe)


calories_dataset = {
    "day": ["Mon", "Tue", "Wed"],
    "calories": [420, 380, 390],
    "duration": [50, 40, 45],
}
calories_dataframe = pd.DataFrame(data=calories_dataset)
print(calories_dataframe)

# One row, selected by its index label (the default labels are 0, 1, 2, ...).
print("DataFrame row 1")
print(calories_dataframe.loc[1])

# A range of rows, selected with a label slice.
print("DataFrame row 0 to 1")
print(calories_dataframe.loc[0:1])

# Rebuild the frame with named row labels instead of the default positions.
calories_dataframe = pd.DataFrame(
    data=calories_dataset,
    index=["day1", "day2", "day3"],
)
print(calories_dataframe)

# With a named index, rows are addressed by that name.
print(calories_dataframe.loc["day2"])


#### CSV To Pandas

In [None]:
# Load the standings CSV into a DataFrame.
# NOTE(review): the variable says "ucl" but the file name says La Liga; the
# name is kept because later cells reuse it.
ucl_dataframe = pd.read_csv('la_liga_standings.csv')
# to_string() renders the whole frame as text, so printing it shows every row.
print(ucl_dataframe.to_string())

# First 5 rows (head() default)
print(ucl_dataframe.head())

# First 2 rows
print(ucl_dataframe.head(2))

# Last 3 rows
print(ucl_dataframe.tail(3))

# Column names, non-null counts and dtypes.
# info() prints the report itself and returns None, so wrapping it in print()
# only added a stray "None" line after the report.
ucl_dataframe.info()

#### JSON To DataFrame

In [None]:
# Parse the season JSON file into a DataFrame and show every row of it.
ucl_dataframe = pd.read_json(path_or_buf='la_liga_2324.json')
print(ucl_dataframe.to_string())

### Data processing with Data Frames

In [None]:
from numpy import fix
from pandas import DataFrame


# Raw data, kept untouched so it can be compared with the cleaned copy.
calories_dataframe_original = pd.read_csv('calories_dataset.csv')
# info() prints its own report and returns None; print(info()) added a stray
# "None" line to the output.
calories_dataframe_original.info()

# Cleaning pipeline (each step returns a new frame, hence the re-assignment):
# 1. Remove exact duplicate rows.
calories_dataframe = calories_dataframe_original.drop_duplicates()
# 2. Replace missing values with 0, but only in the "Calories" column.
#    Alternatives that were not useful here: dropna() right away (drops every
#    row with a missing value) or fillna(0) (fills every column with 0).
calories_dataframe = calories_dataframe.fillna({"Calories": 0})
# 3. Drop the rows that still have a missing value in any other column.
calories_dataframe = calories_dataframe.dropna()

def print_dataframe_stats(dataframe, field):
    """Print a small table of descriptive statistics for one column.

    The table has the columns mean, median, max, min and mode and is framed
    by a header and a footer banner.

    Args:
        dataframe: Frame that contains the column to summarise.
        field: Label of the column to summarise.

    Returns:
        None. The statistics are only printed.
    """
    print("*************Stats*************")
    print(f"Stats for {field}")

    column = dataframe[field]
    # mode() yields a Series (there can be several modal values) while the
    # other statistics are scalars; the DataFrame constructor combines them.
    summary = pd.DataFrame(
        {
            "mean": column.mean(),
            "median": column.median(),
            "max": column.max(),
            "min": column.min(),
            "mode": column.mode(),
        }
    )
    print(summary)

    print("*************END: Stats*************")


# Statistics of the cleaned frame, used to spot suspicious values.
print_dataframe_stats(calories_dataframe, "Calories")
print_dataframe_stats(calories_dataframe, "Pulse")
print_dataframe_stats(calories_dataframe, "Maxpulse")


# Fix wrong column formats: parse the Date column into datetimes.
# format='mixed' is passed so the rows do not need to share one date format.
calories_dataframe["Date"] = pd.to_datetime(calories_dataframe["Date"], format="mixed")

# Some Duration values are clearly wrong (row 7 is far too high). A wrong
# value can be fixed either by replacing it or by removing its row.
# When the wrong row is known it can be replaced directly, e.g.:
#   calories_dataframe.loc[7, 'Duration'] = 45
# Here the rule is applied to the whole column at once instead of visiting
# the rows one by one. Both masks are computed from the values as they were
# before any replacement.
duration = calories_dataframe["Duration"]
too_long = duration > 300
negative = duration < 0

# Value too high --> cap it at the limit of 300 minutes.
calories_dataframe.loc[too_long, "Duration"] = 300
# A negative duration makes no sense for a workout --> remove the row.
# Rows with a duration of exactly 0 are kept, as in the original rule.
calories_dataframe.drop(index=calories_dataframe.index[negative], inplace=True)

print(calories_dataframe.to_string())

### Correlation
`corr()` calcula la correlación entre cada par de columnas del data set --> busca automáticamente qué columnas están relacionadas entre sí.

The number varies from -1 to 1.

1 means that there is a 1 to 1 relationship (a perfect correlation), and for this data set, each time a value went up in the first column, the other one went up as well.

0.9 is also a good relationship, and if you increase one value, the other will probably increase as well.

-0.9 would be just as good a relationship as 0.9, but if you increase one value, the other will probably go down.

0.2 means NOT a good relationship, meaning that if one value goes up does not mean that the other will.

**Valores positivos** *--> si se incrementa una de las columnas, la otra se incrementará también*
**Valores negativos** *--> si se incrementa una de las columnas, la otra DECREMENTARÁ*
**Se considera buena una correlación de al menos 0.6 (o de -0.6 o menor)**



In [None]:



from pprint import pprint


def obtain_highest_correlancy(dataframe, limit: float):
    """Return, for each column, its strongest correlation above a threshold.

    The correlation matrix of ``dataframe`` is scanned row by row. For every
    column, the partner column with the largest absolute correlation is kept,
    provided that absolute value is greater than ``limit``. Positive and
    negative correlations are therefore treated alike.

    Args:
        dataframe: Frame whose numeric columns are correlated. Non-numeric
            columns are ignored.
        limit: Threshold on the absolute correlation; only values strictly
            greater than it are reported.

    Returns:
        A dict keyed by column label. Each value is a dict with the keys
        ``'column'`` (the column itself), ``'row'`` (the partner column) and
        ``'corr'`` (the correlation as a float). Columns without any partner
        above the threshold are absent.
    """
    # numeric_only=True leaves text/date columns out instead of failing on them.
    correlation_df = dataframe.corr(numeric_only=True)
    high_correlancy = {}
    for column in correlation_df.index:
        row = correlation_df.loc[column]
        for partner in row.index:
            # Skip the diagonal by label: a column always correlates perfectly
            # with itself. Testing the value against 1.0 would also hide a
            # perfect correlation between two different columns.
            if partner == column:
                continue
            val = float(row[partner])
            # NaN fails this comparison and is therefore never reported.
            if not abs(val) > limit:
                continue
            best = high_correlancy.get(column)
            # Keep the strongest partner; on a tie the first one found wins.
            if best is None or abs(val) > abs(best['corr']):
                high_correlancy[column] = {'column': column, 'row': partner, 'corr': val}
    return high_correlancy

# Raw data, kept untouched; the cleaning happens on a working copy.
original_large_calories_dataframe = pd.read_csv('calories_large_dataset.csv')

working_large_calories_dataframe = original_large_calories_dataframe.drop_duplicates()
# fillna() returns a new frame; without the assignment the missing Calories
# values were never replaced.
working_large_calories_dataframe = working_large_calories_dataframe.fillna({"Calories": 0})

# Inspect the frame that is about to be correlated (the large one). The
# previous version printed the statistics of the small calories_dataframe.
print_dataframe_stats(working_large_calories_dataframe, "Calories")
print_dataframe_stats(working_large_calories_dataframe, "Pulse")
print_dataframe_stats(working_large_calories_dataframe, "Maxpulse")
print_dataframe_stats(working_large_calories_dataframe, "Duration")

# Data is clean, it doesn't contain any deviated value --> report the
# correlations whose absolute value is bigger than 0.7
print("Correlation")
pprint(obtain_highest_correlancy(dataframe=working_large_calories_dataframe, limit=0.7))


## Pandas PLOT

Para dibujar gráficos

import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:

# Default plot: one line per column (line is the default kind).
working_large_calories_dataframe.plot(kind="line")
plt.show()

In [None]:
# Plot type handed to DataFrame.plot below. Accepted values:
# 'line', 'bar', 'barh', 'hist', 'box', 'kde', 'density', 'area', 'pie', 'scatter', 'hexbin'
kind = "scatter"


# Duration against Calories: the correlation is high, the longer the
# workout the more calories are burned.
working_large_calories_dataframe.plot(x='Duration', y='Calories', kind=kind)
print("More Duration of workout, more calories burned. High correlation")
plt.show()

# Duration against Pulse: no visible relationship between the two columns.
working_large_calories_dataframe.plot(x='Duration', y='Pulse', kind=kind)
print(
    "Bad correlation. Both columns aren't related.",
    "Observations: The longest activity 300 minutes was light cardio: 110ppm",
    "Observations: Most of the workouts are 60 minutes long",
    "Observations: The hardest activity 160ppm minutes was shorter than 50 minutes. ",
    sep="\n",
)

plt.show()

In [None]:
# Histograms of single columns

# How often each workout length occurs.
print("Workouts more frecuents are 60 minutes long. And then, less than 60 50 minutes long")
working_large_calories_dataframe.loc[:, "Duration"].plot(kind="hist")
plt.show()

# How often each pulse level occurs.
print("Workouts more frecuents are low cardio. 110ppm or less")
working_large_calories_dataframe.loc[:, "Pulse"].plot(kind="hist")
plt.show()


### Procesando JSON anidados

Ejemplos de API UEFA --> UCL Teams 2025 from JSON

In [None]:
import json
import fnmatch

import matplotlib.pyplot as plt
import numpy as np

# Name the encoding explicitly: without it open() uses the platform default,
# which can fail or garble any non-ASCII text in the file.
with open('ucl_2025_teams.json', 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Flatten the nested JSON: nested keys become dotted column names.
ucl_dataframe = pd.json_normalize(json_data)

# Convert the identifier columns to numeric dtypes.
# NOTE(review): presumably they arrive as strings in the JSON - confirm.
for id_column in ('associationId', 'id', 'organizationId'):
    ucl_dataframe[id_column] = pd.to_numeric(ucl_dataframe[id_column])

def remove_columns(starts_with_remove, ends_with_exception, dataframe, inplace):
    """Drop the columns whose name matches a glob pattern, sparing one suffix.

    Args:
        starts_with_remove: fnmatch-style pattern (``*`` and ``?`` wildcards).
            Despite the parameter name, the whole column name has to match the
            pattern, so a trailing ``*`` is needed to match by prefix.
        ends_with_exception: Suffix that protects a column from removal even
            when its name matches the pattern.
        dataframe: Frame to remove the columns from.
        inplace: Forwarded to ``DataFrame.drop``. When true ``dataframe`` is
            modified; when false it is left untouched.

    Returns:
        None when ``inplace`` is true, otherwise the new frame without the
        matching columns. (Previously that new frame was discarded, so a call
        with ``inplace=False`` had no effect at all.)
    """
    columns_to_delete = [
        column
        for column in dataframe.columns
        if fnmatch.fnmatch(column, starts_with_remove)
        and not column.endswith(ends_with_exception)
    ]
    return dataframe.drop(columns=columns_to_delete, inplace=inplace)

# Keep only the English translation columns.
# The pattern needs the trailing "*": fnmatch compares the whole column name,
# so "*translations" only matched names ENDING in "translations" and left
# columns such as "translations.countryName.FR" in place.
remove_columns(starts_with_remove="*translations*", ends_with_exception="EN", dataframe=ucl_dataframe, inplace=True)


# Number of teams per country, drawn as a bar chart.
frequencies = ucl_dataframe['translations.countryName.EN'].value_counts()
frequencies.plot(kind='bar', figsize=(10, 6), color='red')

# Add axis labels and a title
plt.title('Teams by Country', fontsize=16)
plt.xlabel('Country', fontsize=14)
plt.ylabel('Number of teams', fontsize=14)
plt.show()

### UEFA UCL 2025 Standings from API

In [None]:
import requests
import datetime
url = "https://standings.uefa.com/v1/standings?competitionId=1&seasonYear=2025"


# The request is sent with a browser-like User-Agent header.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}
# Always pass a timeout: without one requests waits indefinitely when the
# server does not answer, which would freeze the cell.
response = requests.get(url, headers=headers, timeout=30)
# Fallback used when the request fails, so the cell still produces a frame.
data = {}

if response.status_code == 200:
    data = response.json()
else:
    print(f"Error en la petición: {response.status_code}")

# Flatten the nested JSON: nested keys become dotted column names.
standings_dataframe = pd.json_normalize(data)

# Keep only the English translation columns, then drop incomplete rows.
remove_columns(starts_with_remove="*.translations.*", ends_with_exception="EN", dataframe=standings_dataframe, inplace=True)
standings_dataframe.dropna(inplace=True)
# info() prints its own report and returns None; print(info()) added a stray
# "None" line to the output.
standings_dataframe.info()
