## Importación y descripción

In [1]:
import pandas as pd

Un DataFrame es una estructura tabular compuesta por series. Se trata del tipo de datos fundamental en pandas y sobre el que giran la mayoría de operaciones que podemos realizar.

Lo más habitual cuando se trabaja en ciencia de datos es tener la información en distintas fuentes auxiliares: bases de datos, ficheros, llamadas remotas a APIs, etc. Pandas nos ofrece una variedad enorme de funciones para cargar datos desde, prácticamente, cualquier origen.

In [2]:
nba = pd.read_csv("nba.csv") #importamos y cargamos el dataframe
nba.head(7)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0


In [6]:
# Si estan en colab
#from google.colab import files
#uploaded = files.upload()

In [3]:
nba.shape  #filas y columnas

(458, 9)

In [4]:
nba.dtypes #tipo de datos

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [5]:
#Cuantas columnas tienen segun el tipo de datos
nba.dtypes.value_counts()

object     5
float64    4
Name: count, dtype: int64

In [6]:
# columnas deldataframe
nba.columns 

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [7]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [8]:
round(nba.describe(), 2)

Unnamed: 0,Number,Age,Weight,Salary
count,457.0,457.0,457.0,446.0
mean,17.68,26.94,221.52,4842684.11
std,15.97,4.4,26.37,5229237.6
min,0.0,19.0,161.0,30888.0
25%,5.0,24.0,200.0,1044792.25
50%,13.0,26.0,220.0,2839073.0
75%,25.0,30.0,240.0,6500000.0
max,99.0,40.0,307.0,25000000.0


In [9]:
nba.duplicated().sum() #chequear los duplicados

0

In [10]:
nba.isnull().sum() #chequear valores nulos

Name         1
Team         1
Number       1
Position     1
Age          1
Height       1
Weight       1
College     85
Salary      12
dtype: int64

## Transformación de tipos de datos

In [12]:
sales = pd.read_excel("sales.xlsx", usecols = [1,2,3,4,5,6])
sales.head()

Unnamed: 0,first_name,last_name,gender,date,local_num,city
0,Belita,Egdell,Female,13/09/2020,2.54,Åtvidaberg
1,Roseline,Kalinsky,Bigender,30/07/2020,83.85,Carbajales
2,Pamela,Pippard,Polygender,14/01/2021,25.04,Starominskaya
3,Lenard,Skones,Polygender,26/12/2020,38.72,Sabangan
4,Verne,Shackell,Polygender,12/10/2020,47.91,Cawayan Bugtong


In [13]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   first_name  1000 non-null   object 
 1   last_name   1000 non-null   object 
 2   gender      1000 non-null   object 
 3   date        1000 non-null   object 
 4   local_num   1000 non-null   float64
 5   city        1000 non-null   object 
dtypes: float64(1), object(5)
memory usage: 47.0+ KB


In [14]:
sales["local_num"] = sales["local_num"].astype(float)

In [15]:
sales.head()

Unnamed: 0,first_name,last_name,gender,date,local_num,city
0,Belita,Egdell,Female,13/09/2020,2.54,Åtvidaberg
1,Roseline,Kalinsky,Bigender,30/07/2020,83.85,Carbajales
2,Pamela,Pippard,Polygender,14/01/2021,25.04,Starominskaya
3,Lenard,Skones,Polygender,26/12/2020,38.72,Sabangan
4,Verne,Shackell,Polygender,12/10/2020,47.91,Cawayan Bugtong


In [16]:
sales["gender"] = sales["gender"].astype('category')

In [18]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   first_name  1000 non-null   object  
 1   last_name   1000 non-null   object  
 2   gender      1000 non-null   category
 3   date        1000 non-null   object  
 4   local_num   1000 non-null   float64 
 5   city        1000 non-null   object  
dtypes: category(1), float64(1), object(4)
memory usage: 40.5+ KB


In [19]:
sales.sort_values("city", ascending=True).head()

Unnamed: 0,first_name,last_name,gender,date,local_num,city
216,Emanuel,Bridgeland,Agender,27/01/2021,74.46,A-da-Gorda
543,Heidie,Rousby,Genderfluid,11/07/2020,65.44,Aasiaat
181,Tonnie,Bunworth,Agender,19/10/2020,70.84,Abdurahmoni Jomí
692,Barth,Stagge,Genderfluid,25/01/2021,11.46,Abelheira
314,Janette,Thwaites,Male,13/09/2020,98.07,Abelheira


In [20]:
sales.sort_values(["gender", "local_num"]).head(15)

Unnamed: 0,first_name,last_name,gender,date,local_num,city
469,Marney,Keaveney,Agender,07/08/2020,1.86,Choma
517,Anni,Madison,Agender,10/05/2020,3.09,Haveluloto
733,Bartholomeus,Baglow,Agender,08/04/2020,3.52,Kalidawir
244,Larisa,McGrowther,Agender,01/05/2020,3.53,Tonoas Municipal Building
77,Barrett,Perl,Agender,03/10/2020,4.08,Taishanmiao
738,Erskine,Godleman,Agender,16/08/2020,4.65,Kościerzyna
912,Grady,Baggaley,Agender,09/09/2020,4.74,Qozonketkan
930,Harry,Hamby,Agender,27/03/2021,5.62,Homa Bay
318,Dorthy,Matejovsky,Agender,27/07/2020,10.35,Monteros
545,Marylou,Spearman,Agender,25/03/2021,10.45,Shangcheng Chengguanzhen


In [21]:
sales["gender"].value_counts()

gender
Bigender       143
Male           142
Non-binary     140
Genderfluid    130
Genderqueer    115
Agender        113
Female         112
Polygender     105
Name: count, dtype: int64

## Fechas

In [22]:
import datetime as dt

In [23]:
mi_ejemplo = pd.to_datetime('2022-12-09 14:20:35', yearfirst=True)

# Imprimir los componentes de la fecha
print("Año:", mi_ejemplo.year)
print("Mes:", mi_ejemplo.month)
print("Día:", mi_ejemplo.day)
print("Hora:", mi_ejemplo.hour)
print("Minuto:", mi_ejemplo.minute)
print("Segundo:", mi_ejemplo.second)
print("Día del año:", mi_ejemplo.dayofyear)

Año: 2022
Mes: 12
Día: 9
Hora: 14
Minuto: 20
Segundo: 35
Día del año: 343


In [29]:
print(pd.Timestamp("2021-4-14"))
print(pd.Timestamp("2021/4/14"))
print(pd.Timestamp("2021, 4, 14"))
print(pd.Timestamp("14/4/2021"))
print(pd.Timestamp("2021-4-14 20:30:45"))
print(pd.Timestamp("2021/4/14 8:30:45 PM"))

2021-04-14 00:00:00
2021-04-14 00:00:00


TypeError: __new__() got an unexpected keyword argument 'format'

In [30]:
print(pd.to_datetime("2014-5-15"))
print(pd.to_datetime(["2021/4/14", "2014/08/14", "2018", "December 20th, 1991"]))

ValueError: time data "2014-5-15" doesn't match format "YYYY-MM-DD", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [26]:
times = pd.Series(["2021/4/14", "2014/08/14", "2018", "December 20th, 1991"])
times

0              2021/4/14
1             2014/08/14
2                   2018
3    December 20th, 1991
dtype: object

In [27]:
pd.to_datetime(times)

ValueError: time data "2018" doesn't match format "%Y/%m/%d", at position 2. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [None]:
sales["date"] = pd.to_datetime(sales["date"])
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   first_name  1000 non-null   object        
 1   last_name   1000 non-null   object        
 2   gender      1000 non-null   category      
 3   date        1000 non-null   datetime64[ns]
 4   local_num   1000 non-null   float64       
 5   city        1000 non-null   object        
dtypes: category(1), datetime64[ns](1), float64(1), object(3)
memory usage: 40.5+ KB


  sales["date"] = pd.to_datetime(sales["date"])


In [None]:
sales.head()

Unnamed: 0,first_name,last_name,gender,date,local_num,city
0,Belita,Egdell,Female,2020-09-13,2.54,Åtvidaberg
1,Roseline,Kalinsky,Bigender,2020-07-30,83.85,Carbajales
2,Pamela,Pippard,Polygender,2021-01-14,25.04,Starominskaya
3,Lenard,Skones,Polygender,2020-12-26,38.72,Sabangan
4,Verne,Shackell,Polygender,2020-12-10,47.91,Cawayan Bugtong


In [None]:
pd.read_excel("sales.xlsx", 
              parse_dates = ["date"]).info()

  pd.read_excel("sales.xlsx",


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          1000 non-null   int64         
 1   first_name  1000 non-null   object        
 2   last_name   1000 non-null   object        
 3   gender      1000 non-null   object        
 4   date        1000 non-null   datetime64[ns]
 5   local_num   1000 non-null   float64       
 6   city        1000 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(4)
memory usage: 54.8+ KB


In [None]:
x = pd.to_datetime("2021-12-10")
y = pd.to_datetime("2021-12-12")

y - x

Timedelta('2 days 00:00:00')