Pandas has two primary data structures: Series and DataFrame. 

In [4]:
import numpy as np
import pandas as pd



In [3]:
# Build a DataFrame from a dictionary: every key becomes a column label
# and the list stored under it supplies that column's values.
d = dict(col1=[1, 2], col2=[3, 4])
df = pd.DataFrame(d)
df



Unnamed: 0,col1,col2
0,1,3
1,2,4


In [6]:
# Build a DataFrame from a 2-D NumPy array; the column labels are given explicitly.
df2 = pd.DataFrame(np.arange(1, 10).reshape(3, 3), columns=list('abc'))
df2

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [None]:
# From CSV: pd.read_csv reads the file at the given path and the result is bound to f3.
# NOTE(review): '/file_path/file_name.csv' looks like a placeholder path -
# point it at a real CSV file before running this cell.

f3 = pd.read_csv('/file_path/file_name.csv')


In [12]:
# DataFrame.columns holds the column labels of the frame; it has a size
# and can be indexed by position like a sequence.
df = pd.DataFrame(dict(A=[1, 2], B=[3, 4]))

print(df.columns.size)
print(*df.columns[:2], sep='\n')

 

2
A
B


In [16]:
# One-row frame mixing float, int, datetime and string columns; the next
# cell uses it to show what .dtypes reports for each column.
df = pd.DataFrame(dict(
    float=[1.0],
    int=[1],
    datetime=[pd.Timestamp('20180310')],
    string=['foo'],
))
df
 

Unnamed: 0,float,int,datetime,string
0,1.0,1,2018-03-10,foo


In [17]:
 
# .dtypes returns a Series mapping each column label to that column's data type
df.dtypes

float              float64
int                  int64
datetime    datetime64[ns]
string              object
dtype: object

In [18]:
# .iloc[] accesses a group of rows and columns by integer position.
# Reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html#pandas.DataFrame.iloc
# It is primarily integer-position based (from 0 to length-1 of the axis),
# but may also be used with a boolean array.
# Note the square brackets: .iloc is an indexer, not a method call.

mydict = [
    dict(a=1, b=2, c=3, d=4),
    dict(a=100, b=200, c=300, d=400),
    dict(a=1000, b=2000, c=3000, d=4000),
]
df = pd.DataFrame(mydict)
df


Unnamed: 0,a,b,c,d
0,1,2,3,4
1,100,200,300,400
2,1000,2000,3000,4000


In [23]:
# Indexing just the rows with a scalar integer returns that row as a Series
# (column labels become the Series index, the row label becomes its name).
# The former `type(df.iloc[0])` line was dropped: it was not the last
# expression of the cell, so its value was computed and silently discarded.
df.iloc[0]

a    1
b    2
c    3
d    4
Name: 0, dtype: int64

In [22]:
# With a list of integers: the result is a DataFrame, even for a single row
df.iloc[[0]]


Unnamed: 0,a,b,c,d
0,1,2,3,4


In [24]:
# With a slice object: selects rows by position; here the frame has three
# rows, so :3 returns all of them.
df.iloc[:3]

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,100,200,300,400
2,1000,2000,3000,4000


In [25]:
# With a boolean mask the same length as the index:
# rows whose mask entry is True are kept (here rows 0 and 2).
df.iloc[[True, False, True]]

Unnamed: 0,a,b,c,d
0,1,2,3,4
2,1000,2000,3000,4000


Unnamed: 0,a,b,c,d
0,1,2,3,4
2,1000,2000,3000,4000


In [27]:
# With a callable: it is called with the DataFrame and here returns a boolean
# mask that is True for even index labels, so rows 0 and 2 are selected.
df.iloc[lambda x: x.index % 2 == 0]

Unnamed: 0,a,b,c,d
0,1,2,3,4
2,1000,2000,3000,4000


In [28]:
# Frame with string row labels, used by the label-based .loc examples that follow.
df = pd.DataFrame(
    {'max_speed': [1, 4, 7], 'shield': [2, 5, 8]},
    index=['cobra', 'viper', 'sidewinder'],
)
df

Unnamed: 0,max_speed,shield
cobra,1,2
viper,4,5
sidewinder,7,8


In [29]:
# A single row label returns that row as a Series
df.loc['viper']

max_speed    4
shield       5
Name: viper, dtype: int64

In [30]:
# A list of row labels returns a DataFrame with those rows
df.loc[['viper', 'sidewinder']]

Unnamed: 0,max_speed,shield
viper,4,5
sidewinder,7,8


In [31]:
# A row label plus a column label returns the single value at that cell
df.loc['cobra', 'shield']

np.int64(2)

In [32]:
# A label slice for the rows plus one column label returns a Series;
# both slice endpoints ('cobra' and 'viper') are included.
df.loc['cobra':'viper', 'max_speed']


cobra    1
viper    4
Name: max_speed, dtype: int64

In [33]:
# Boolean list with the same length as the row axis:
# only rows whose entry is True are returned (here the last row).
df.loc[[False, False, True]]


Unnamed: 0,max_speed,shield
sidewinder,7,8


In [34]:
# .shape returns a (number of rows, number of columns) tuple.
# The doctest-style '>>>' prompts were removed so the cell is plain Python.
df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
df.shape
 

(2, 2)

In [36]:
# Same idea with a third column: the shape becomes (2, 3).
df = pd.DataFrame(dict(col1=[1, 2], col2=[3, 4], col3=[5, 6]))
df.shape

(2, 3)

In [37]:
# Small all-integer frame used to show conversion to a NumPy array.
df = pd.DataFrame(
    [[3, 94, 31],
     [29, 170, 115]],
    columns=['age', 'height', 'weight'],
)
df

Unnamed: 0,age,height,weight
0,3,94,31
1,29,170,115


In [38]:
# .values returns the frame's data as a 2-D NumPy array (one inner row per DataFrame row)
df.values

array([[  3,  94,  31],
       [ 29, 170, 115]])

In [39]:
# .to_numpy() converts the DataFrame to a NumPy array
pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()

array([[1, 3],
       [2, 4]])

In [41]:
# Three identical rows [4, 9]; used by the apply() examples that follow.
df = pd.DataFrame([[4, 9] for _ in range(3)], columns=list('AB'))
df

Unnamed: 0,A,B
0,4,9
1,4,9
2,4,9


In [41]:
# NOTE(review): this cell repeats the previous one (same code, same In [41]
# counter); it rebuilds an identical frame and can likely be removed.
df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
df

Unnamed: 0,A,B
0,4,9
1,4,9
2,4,9


In [42]:
# apply() with np.sqrt: every value is replaced by its square root
df.apply(np.sqrt)

Unnamed: 0,A,B
0,2.0,3.0
1,2.0,3.0
2,2.0,3.0


In [43]:
# apply() with np.sum and axis=0: one total per column, returned as a Series
df.apply(np.sum, axis=0)

A    12
B    27
dtype: int64

In [5]:
# A Series with explicit string labels; dict keys become the index.
s = pd.Series({"a": 1, "b": 2})
s



a    1
b    2
dtype: int64

In [6]:
# copy() returns a new Series with the same values and index as s
s_copy = s.copy()
s_copy

a    1
b    2
dtype: int64

A shallow copy (`copy(deep=False)`) shares its data and index with the original.
A deep copy (`copy(deep=True)`, the default) has its own copy of the data and index.



In [8]:
# describe() summarises a numeric Series: count, mean, std, min, quartiles and max
s = pd.Series(range(1, 4))
s.describe()


count    3.0
mean     2.0
std      1.0
min      1.0
25%      1.5
50%      2.0
75%      2.5
max      3.0
dtype: float64

In [13]:
# drop() removes the specified labels from rows or columns.
# Start from a 3x4 frame holding the integers 0..11.
df = pd.DataFrame(np.arange(12).reshape(-1, 4), columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [11]:
# Drop columns 'B' and 'C' by passing the labels with axis=1
df.drop(['B', 'C'], axis=1)

Unnamed: 0,A,D
0,0,3
1,4,7
2,8,11


In [14]:
# Same result using the columns= keyword instead of axis=1
df.drop(columns=['B', 'C'])

Unnamed: 0,A,D
0,0,3
1,4,7
2,8,11


In [15]:
# Without axis/columns the labels refer to the index: drops rows 0 and 1
df.drop([0, 1])

Unnamed: 0,A,B,C,D
2,8,9,10,11


In [16]:
# df itself is unchanged: the drop() calls above returned new frames
df


Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [None]:
# Group by


In [17]:
# Two animals with two speed measurements each, used to demonstrate groupby().
df = pd.DataFrame({
    'Animal': ['Falcon'] * 2 + ['Parrot'] * 2,
    'Max Speed': [380.0, 370.0, 24.0, 26.0],
})
df

 

Unnamed: 0,Animal,Max Speed
0,Falcon,380.0
1,Falcon,370.0
2,Parrot,24.0
3,Parrot,26.0


In [18]:
# Group the rows by 'Animal' and average the remaining column within each group
df.groupby(['Animal']).mean()


Unnamed: 0_level_0,Max Speed
Animal,Unnamed: 1_level_1
Falcon,375.0
Parrot,25.0


In [19]:
# Nine-row, single-column frame used to demonstrate head().
df = pd.DataFrame(
    {'animal': 'alligator bee falcon lion monkey parrot shark whale zebra'.split()}
)
df

Unnamed: 0,animal
0,alligator
1,bee
2,falcon
3,lion
4,monkey
5,parrot
6,shark
7,whale
8,zebra


In [20]:
# head() with no argument returns the first five rows
df.head()

Unnamed: 0,animal
0,alligator
1,bee
2,falcon
3,lion
4,monkey


In [21]:
# head(3) returns only the first three rows
df.head(3)

Unnamed: 0,animal
0,alligator
1,bee
2,falcon


In [22]:
# Three columns of different kinds (int, text, float) for the info() demo.
int_values = list(range(1, 6))
text_values = 'alpha beta gamma delta epsilon'.split()
float_values = [step / 4 for step in range(5)]
df = pd.DataFrame(dict(int_col=int_values,
                       text_col=text_values,
                       float_col=float_values))
df

Unnamed: 0,int_col,text_col,float_col
0,1,alpha,0.0
1,2,beta,0.25
2,3,gamma,0.5
3,4,delta,0.75
4,5,epsilon,1.0


In [23]:
# info() prints a summary: index range, per-column non-null counts and dtypes, memory usage
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   int_col    5 non-null      int64  
 1   text_col   5 non-null      object 
 2   float_col  5 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 252.0+ bytes


In [None]:
# .isna()
# Detects missing values

In [24]:
# Frame containing several kinds of "missing" markers (np.nan, pd.NaT, None)
# plus an empty string, used to demonstrate isna().
df = pd.DataFrame({
    'age': [5, 6, np.nan],
    'born': [pd.NaT, pd.Timestamp('1939-05-27'), pd.Timestamp('1940-04-25')],
    'name': ['Alfred', 'Batman', ''],
    'toy': [None, 'Batmobile', 'Joker'],
})
df

Unnamed: 0,age,born,name,toy
0,5.0,NaT,Alfred,
1,6.0,1939-05-27,Batman,Batmobile
2,,1940-04-25,,Joker


In [25]:
# isna() returns a same-shaped boolean frame: True where a value is missing.
# Note the empty string in 'name' is NOT reported as missing.
df.isna()

Unnamed: 0,age,born,name,toy
0,False,True,False,True
1,False,False,False,False
2,True,False,False,False


`.sort_values()`
sorts a DataFrame by the values along a given axis.


In [27]:
 df = pd.DataFrame({
    'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
    'col2': [2, 1, 9, 8, 7, 4],
    'col3': [0, 1, 9, 4, 2, 3],
    'col4': ['a', 'B', 'c', 'D', 'e', 'F']
})
df


Unnamed: 0,col1,col2,col3,col4
0,A,2,0,a
1,A,1,1,B
2,B,9,9,c
3,,8,4,D
4,D,7,2,e
5,C,4,3,F


In [28]:
# Sort rows by 'col1' in ascending order; the row with the missing value ends up last
df.sort_values(by=['col1'])

Unnamed: 0,col1,col2,col3,col4
0,A,2,0,a
1,A,1,1,B
2,B,9,9,c
5,C,4,3,F
4,D,7,2,e
3,,8,4,D


In [29]:
# Leg/wing counts for four animals; 'dog' and 'cat' share the same row values,
# which is what the value_counts() example relies on.
df = pd.DataFrame(
    [[2, 2], [4, 0], [4, 0], [6, 0]],
    index=['falcon', 'dog', 'cat', 'ant'],
    columns=['num_legs', 'num_wings'],
)
df

Unnamed: 0,num_legs,num_wings
falcon,2,2
dog,4,0
cat,4,0
ant,6,0


In [30]:
# value_counts() counts how often each distinct row (combination of column
# values) occurs; the most frequent combination is listed first.
df.value_counts()

num_legs  num_wings
4         0            2
2         2            1
6         0            1
Name: count, dtype: int64

`where()` replaces values in the DataFrame (or Series) wherever a given condition is False; values where the condition is True are kept.


In [32]:
# Integers 0..4, used to demonstrate Series.where()
s = pd.Series([0, 1, 2, 3, 4])
s

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [33]:
# Keep values where s > 0; the others become NaN (so the result is float64)
s.where(s > 0)

0    NaN
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [34]:
 

# Sample DataFrame: three integer columns of five rows each
data = dict(
    A=[1, 2, 3, 4, 5],
    B=[10, 20, 30, 40, 50],
    C=[100, 200, 300, 400, 500],
)
df = pd.DataFrame(data)
print(df)


   A   B    C
0  1  10  100
1  2  20  200
2  3  30  300
3  4  40  400
4  5  50  500


In [35]:
# Apply where() to column 'B': values <= 30 are kept, everything else is
# replaced by NaN (the default replacement), which turns the column into floats.
df['B'] = df['B'].where(df['B'].le(30))
df

Unnamed: 0,A,B,C
0,1,10.0,100
1,2,20.0,200
2,3,30.0,300
3,4,,400
4,5,,500


In [None]:
# Reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html


In [36]:
# Five-row frame with string row labels 'row_0'..'row_4', used for the
# loc[] / iloc[] selection examples below.
df = pd.DataFrame(
    dict(
        A=['alpha', 'apple', 'arsenic', 'angel', 'android'],
        B=list(range(1, 6)),
        C=['coconut', 'curse', 'cassava', 'cuckoo', 'clarinet'],
        D=list(range(6, 11)),
    ),
    index=[f'row_{n}' for n in range(5)],
)
df

Unnamed: 0,A,B,C,D
row_0,alpha,1,coconut,6
row_1,apple,2,curse,7
row_2,arsenic,3,cassava,8
row_3,angel,4,cuckoo,9
row_4,android,5,clarinet,10


In [37]:
# Selecting one row by its label prints a Series (columns become the index)
print(df.loc['row_1'])

A    apple
B        2
C    curse
D        7
Name: row_1, dtype: object


In [None]:
# Passing just the row label inside the selector brackets returns a Series object.
# Passing the row label inside a list returns a DataFrame object:


In [38]:
# The same label wrapped in a list prints a one-row DataFrame
print(df.loc[['row_1']])

           A  B      C  D
row_1  apple  2  curse  7


In [None]:
# To select multiple rows by name, use a list within the selector brackets:


In [39]:
# Rebuild the five-row labelled frame ('row_0'..'row_4') for the
# multi-row selection examples.
df = pd.DataFrame(
    dict(
        A=['alpha', 'apple', 'arsenic', 'angel', 'android'],
        B=list(range(1, 6)),
        C=['coconut', 'curse', 'cassava', 'cuckoo', 'clarinet'],
        D=list(range(6, 11)),
    ),
    index=['row_' + str(n) for n in range(5)],
)
df

Unnamed: 0,A,B,C,D
row_0,alpha,1,coconut,6
row_1,apple,2,curse,7
row_2,arsenic,3,cassava,8
row_3,angel,4,cuckoo,9
row_4,android,5,clarinet,10


In [40]:
# A list of labels selects exactly those rows, in the order given
print(df.loc[['row_2', 'row_4']])

             A  B         C   D
row_2  arsenic  3   cassava   8
row_4  android  5  clarinet  10


In [41]:
# A label slice selects a range of rows; the end label 'row_3' is included
print(df.loc['row_0':'row_3'])

             A  B        C  D
row_0    alpha  1  coconut  6
row_1    apple  2    curse  7
row_2  arsenic  3  cassava  8
row_3    angel  4   cuckoo  9


In [None]:
# iloc[] lets you select rows by numeric position, similar to how you would access elements of a list or an array.

In [43]:
# Show the whole frame, a blank line, then the row at integer position 1
# (which is the row labelled 'row_1') as a Series.
print(df, df.iloc[1], sep='\n\n')

             A  B         C   D
row_0    alpha  1   coconut   6
row_1    apple  2     curse   7
row_2  arsenic  3   cassava   8
row_3    angel  4    cuckoo   9
row_4  android  5  clarinet  10

A    apple
B        2
C    curse
D        7
Name: row_1, dtype: object


In [44]:
# A one-element list of positions prints a one-row DataFrame instead of a Series
print(df.iloc[[1]])

           A  B      C  D
row_1  apple  2  curse  7


In [45]:
# A list of positions selects those rows (here the 1st, 3rd and 5th)
print(df.iloc[[0, 2, 4]])

             A  B         C   D
row_0    alpha  1   coconut   6
row_2  arsenic  3   cassava   8
row_4  android  5  clarinet  10


In [46]:
# A positional slice excludes its end: 0:3 prints the rows at positions 0, 1 and 2
print(df.iloc[0:3])

             A  B        C  D
row_0    alpha  1  coconut  6
row_1    apple  2    curse  7
row_2  arsenic  3  cassava  8


In [47]:
# Selecting a single column by name prints a Series
print(df['C'])


row_0     coconut
row_1       curse
row_2     cassava
row_3      cuckoo
row_4    clarinet
Name: C, dtype: object


In [47]:
# NOTE(review): this cell repeats the previous one (same code, same In [47]
# counter) and can likely be removed.
print(df['C'])


row_0     coconut
row_1       curse
row_2     cassava
row_3      cuckoo
row_4    clarinet
Name: C, dtype: object


In [48]:
# A list of column names prints a DataFrame with just those columns
print(df[['A', 'C']])

             A         C
row_0    alpha   coconut
row_1    apple     curse
row_2  arsenic   cassava
row_3    angel    cuckoo
row_4  android  clarinet


In [49]:
# With loc[]: ':' keeps every row, the list picks columns by label
print(df.loc[:, ['B', 'D']])

       B   D
row_0  1   6
row_1  2   7
row_2  3   8
row_3  4   9
row_4  5  10


In [50]:
# With iloc[]: ':' keeps every row, the list picks columns by position
# (positions 1 and 3 are columns 'B' and 'D')
print(df.iloc[:, [1,3]])

       B   D
row_0  1   6
row_1  2   7
row_2  3   8
row_3  4   9
row_4  5  10


In [None]:
# Select rows and columns

In [None]:
# Both loc[] and iloc[] can be used to select specific rows and columns together.

In [51]:
# loc[]: rows by label slice (end label included), columns by a list of labels
print(df.loc['row_0':'row_2', ['A','C']])

             A        C
row_0    alpha  coconut
row_1    apple    curse
row_2  arsenic  cassava


In [52]:
# iloc[]: rows by a list of positions, columns by a positional slice (0:3 -> A, B, C)
print(df.iloc[[2, 4], 0:3])

             A  B         C
row_2  arsenic  3   cassava
row_4  android  5  clarinet


In [53]:
# Chaining iloc[] with a column list is most convenient for VIEWING;
# here it prints a one-column DataFrame:
print(df.iloc[0:3][['D']])

# A single loc[] call (row labels taken from df.index, plus a column label)
# is best practice/more stable for assignment/manipulation; here it prints a Series:
print(df.loc[df.index[0:3], 'D'])

       D
row_0  6
row_1  7
row_2  8
row_0    6
row_1    7
row_2    8
Name: D, dtype: int64


In [54]:
# Same data as before but WITHOUT custom row labels, so the frame gets the
# default integer index 0..4.
df = pd.DataFrame(dict(
    A=['alpha', 'apple', 'arsenic', 'angel', 'android'],
    B=list(range(1, 6)),
    C=['coconut', 'curse', 'cassava', 'cuckoo', 'clarinet'],
    D=list(range(6, 11)),
))
df

Unnamed: 0,A,B,C,D
0,alpha,1,coconut,6
1,apple,2,curse,7
2,arsenic,3,cassava,8
3,angel,4,cuckoo,9
4,android,5,clarinet,10


In [55]:
# loc[] treats 0:3 as index LABELS, so the end label 3 is included (four rows),
# unlike the positional slice iloc[0:3] which stops before position 3.
print(df.loc[0:3, ['D']])

   D
0  6
1  7
2  8
3  9
