In [1]:
import numpy as np
import pandas as pd

Series

In [2]:
# Build a Series from a plain Python list; no index is given, so pandas
# labels the values with the default integer index 0..3.
data = [10, 20, 30, 40]
s = pd.Series(data=data)
s

0    10
1    20
2    30
3    40
dtype: int64

In [3]:
# With the default integer index, s[0] looks up the label 0, i.e. the first value.
s[0]

np.int64(10)

In [4]:
# Passing a list of labels selects several entries and returns a Series.
s[[0,2]]

0    10
2    30
dtype: int64

Custom indexing

In [5]:
# Same values as before, but labelled 'a'..'d' instead of the default 0..3.
s = pd.Series(data=[10, 20, 30, 40], index=list('abcd'))
s

a    10
b    20
c    30
d    40
dtype: int64

In [6]:
# Look up a single value by its custom string label.
s['a']

np.int64(10)

In [7]:
# Assignment through a label overwrites the value stored under 'b' in the existing Series.
s['b'] = 50

In [8]:
# Display the Series to confirm that 'b' now holds 50.
s

a    10
b    50
c    30
d    40
dtype: int64

In [9]:
# Arithmetic between two Series aligns on index labels, not on position.
# 'dave' exists only in s2, so its sum is NaN and the result becomes float64.
s1 = pd.Series({'alice': 10, 'bob': 20, 'charlie': 30})
s2 = pd.Series({'bob': 1, 'charlie': 2, 'alice': 3, 'dave': 4})
print(s1 + s2)

alice      13.0
bob        21.0
charlie    32.0
dave        NaN
dtype: float64


In [10]:
# The row labels are stored in a pandas Index object.
s.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [11]:
# Vectorized operation: the scalar 5 is added to every element at once.
# The result is a new Series; s itself is not reassigned here.
s+5

a    15
b    55
c    35
d    45
dtype: int64

In [12]:
# Extract the underlying data as a NumPy array (index labels are dropped).
# .to_numpy() is the accessor recommended by the pandas docs over the older
# .values attribute; for this integer Series both give the same array.
s.to_numpy()

array([10, 50, 30, 40])

In [13]:
# Boolean masking: s>20 yields a True/False Series, and indexing with it
# keeps only the entries where the condition is True.
s[s>20]

b    50
c    30
d    40
dtype: int64

In [14]:
# Convert the values to a plain Python list; the index labels are not included.
s.tolist()

[10, 50, 30, 40]

DataFrame

In [15]:
# Two dicts keyed by country: employee head-count and head-office city.
count = {
    'USA': 50,
    'India': 500,
    'UK': 80,
    'France': 90,
}
head = {
    'USA': 'NYC',
    'India': 'Pune',
    'UK': 'London',
    'France': 'Paris',
}

In [16]:
# Each inner dict becomes one column; its keys (the country names) become the row index.
df = pd.DataFrame(
    data={
        'Headoffice': head,
        'Employee count': count,
    }
)
df

Unnamed: 0,Headoffice,Employee count
USA,NYC,50
India,Pune,500
UK,London,80
France,Paris,90


In [17]:
# .loc selects by label; a single row label returns that row as a Series
# indexed by the column names.
df.loc['USA']

Headoffice        NYC
Employee count     50
Name: USA, dtype: object

In [18]:
# Label slices with .loc include BOTH endpoints, so the 'France' row is part of the result.
df.loc['India':'France']

Unnamed: 0,Headoffice,Employee count
India,Pune,500
UK,London,80
France,Paris,90


In [19]:
# .loc[row_label, column_label] returns the single cell at row 'USA', column 'Headoffice'.
df.loc['USA','Headoffice']

'NYC'

In [20]:
# Boolean-mask filtering: keep the rows whose 'Employee count' is below 100.
# A boolean Series mask like this works with [] and .loc; .iloc rejects a
# boolean Series as a mask.
df[df['Employee count'] < 100]

Unnamed: 0,Headoffice,Employee count
USA,NYC,50
UK,London,80
France,Paris,90


In [21]:
# A list of labels returns the rows in the order requested (UK before India),
# not in the order they appear in the frame.
df.loc[['UK','India']]

Unnamed: 0,Headoffice,Employee count
UK,London,80
India,Pune,500


In [22]:
# Row slice (endpoint-inclusive) combined with one column label returns a Series.
df.loc['India': 'France','Employee count']

India     500
UK         80
France     90
Name: Employee count, dtype: int64

In [23]:
# .iloc selects by integer position; position 1 is the second row (India).
df.iloc[1]

Headoffice        Pune
Employee count     500
Name: India, dtype: object

In [24]:
# Positional slices follow Python semantics: the stop position is excluded,
# so only rows 0 and 1 are returned.
df.iloc[0:2]

Unnamed: 0,Headoffice,Employee count
USA,NYC,50
India,Pune,500


In [25]:
# Row position 1 (India, head office Pune) and column position 1 ('Employee count').
df.iloc[1,1]

np.int64(500)

In [26]:
# Rows at positions 1 and 2 (stop 3 is excluded), columns at positions 0 and 1.
df.iloc[1:3,[0,1]]

Unnamed: 0,Headoffice,Employee count
India,Pune,500
UK,London,80


In [27]:
# Company figures with one missing value: Maruti has no Profit entry.
# The gap is written out as np.nan instead of passing a short row and relying
# on pandas to pad it silently; the resulting frame is the same.
df = pd.DataFrame(
    data=[
        [500, 35, 23],
        [600, 22, np.nan],
        [300, 25, 25],
    ],
    index=['Tata', 'Maruti', 'Honda'],
    columns=['Revenue', 'Market Share', 'Profit'],
)
df

Unnamed: 0,Revenue,Market Share,Profit
Tata,500,35,23.0
Maruti,600,22,
Honda,300,25,25.0


In [28]:
# Negative positions count from the end: -1 is the last row (Honda).
# The row comes back as a single-dtype Series, shown here as float64.
df.iloc[-1]

Revenue         300.0
Market Share     25.0
Profit           25.0
Name: Honda, dtype: float64

In [29]:
# First two rows of the frame.
df.head(2)

Unnamed: 0,Revenue,Market Share,Profit
Tata,500,35,23.0
Maruti,600,22,


In [30]:
# Last row of the frame.
df.tail(1)

Unnamed: 0,Revenue,Market Share,Profit
Honda,300,25,25.0


In [31]:
# (number of rows, number of columns)
df.shape

(3, 3)

In [32]:
# Move the company names out of the index into an ordinary column (named 'index')
# and fall back to the default integer index. Reassigning the returned frame
# replaces inplace=True, which mutates the object and returns None.
df = df.reset_index()

In [33]:
# Give the column holding the company names a meaningful label.
# rename() returns a new frame; reassigning it replaces the inplace=True mutation.
df = df.rename(columns={'index': 'Company'})
df

Unnamed: 0,Company,Revenue,Market Share,Profit
0,Tata,500,35,23.0
1,Maruti,600,22,
2,Honda,300,25,25.0


In [34]:
# Column labels of the frame, as an Index.
df.columns

Index(['Company', 'Revenue', 'Market Share', 'Profit'], dtype='object')

In [35]:
# Promote the 'Company' column back to the row index.
# set_index() returns a new frame; reassigning it replaces the inplace=True mutation.
df = df.set_index('Company')
df

Unnamed: 0_level_0,Revenue,Market Share,Profit
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tata,500,35,23.0
Maruti,600,22,
Honda,300,25,25.0


In [36]:
# Row labels; the index carries the name 'Company' after set_index.
df.index

Index(['Tata', 'Maruti', 'Honda'], dtype='object', name='Company')

In [37]:
# For a DataFrame, keys() returns the column labels (the index column 'Company' is not included).
df.keys()

Index(['Revenue', 'Market Share', 'Profit'], dtype='object')

In [38]:
# Sales per country for two quarters. Japan appears only in Q1 and Brazil only in Q2.
q1 = dict(Japan=80, China=450, India=200, US=250)
q2 = dict(Brazil=100, China=500, India=210, US=260)
sales_Q1, sales_Q2 = pd.Series(q1), pd.Series(q2)

In [39]:
# Addition aligns on index labels; countries present in only one quarter
# (Brazil, Japan) come out as NaN.
print(sales_Q1+sales_Q2)

Brazil      NaN
China     950.0
India     410.0
Japan       NaN
US        510.0
dtype: float64


In [40]:
# Handling missing values: fill_value=0 stands in for a label that is absent
# from one operand, so Brazil and Japan keep their single-quarter figure.
print(sales_Q1.add(sales_Q2, fill_value = 0))

Brazil    100.0
China     950.0
India     410.0
Japan      80.0
US        510.0
dtype: float64


Creating a DataFrame from scratch

In [41]:
# Seed NumPy's global generator so the same scores are drawn on every run.
np.random.seed(42)
# 4 x 3 matrix of integers from 1 to 100 (the upper bound 101 is exclusive).
scores = np.random.randint(low=1, high=101, size=(4, 3))
scores

array([[ 52,  93,  15],
       [ 72,  61,  21],
       [ 83,  87,  75],
       [ 75,  88, 100]], dtype=int32)

In [42]:
# Row labels (one per player) and column labels (one per match) for the score table.
players = ['Virat', 'Rohit', 'MS-Dhoni', 'KL Rahul']
matches = [f'Match {number}' for number in range(1, 4)]

In [43]:
# Wrap the score matrix in a labelled frame: one row per player, one column per match.
df = pd.DataFrame(scores, index=players, columns=matches)
df

Unnamed: 0,Match 1,Match 2,Match 3
Virat,52,93,15
Rohit,72,61,21
MS-Dhoni,83,87,75
KL Rahul,75,88,100


In [44]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, Virat to KL Rahul
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Match 1  4 non-null      int32
 1   Match 2  4 non-null      int32
 2   Match 3  4 non-null      int32
dtypes: int32(3)
memory usage: 80.0+ bytes


In [45]:
# A single integer position returns the row as a Series.
df.iloc[0]

Match 1    52
Match 2    93
Match 3    15
Name: Virat, dtype: int32

In [46]:
# A list of positions, even with one element, returns a DataFrame rather than a Series.
df.iloc[[0]]

Unnamed: 0,Match 1,Match 2,Match 3
Virat,52,93,15


In [47]:
# len() of a DataFrame is its number of rows.
len(df)

4

In [48]:
# Summary statistics per match, transposed so each match is a row and each statistic a column.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Match 1,4.0,70.5,13.178265,52.0,67.0,73.5,77.0,83.0
Match 2,4.0,82.25,14.407753,61.0,80.5,87.5,89.25,93.0
Match 3,4.0,52.75,41.475897,15.0,19.5,48.0,81.25,100.0


In [49]:
# Rows ordered by the 'Match 1' score, ascending (the default).
# The result is not assigned, so df keeps its original row order.
df.sort_values(by= 'Match 1')

Unnamed: 0,Match 1,Match 2,Match 3
Virat,52,93,15
Rohit,72,61,21
KL Rahul,75,88,100
MS-Dhoni,83,87,75


In [50]:
# Selecting one column yields a Series, which is then sorted in ascending order.
df['Match 1'].sort_values()

Virat       52
Rohit       72
KL Rahul    75
MS-Dhoni    83
Name: Match 1, dtype: int32

In [51]:
# Sample records in which 'Age' and 'Salary' mix numbers with the placeholder
# string 'Unknown'; used below to demonstrate numeric conversion.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, 30, 35, 'Unknown'],
    Salary=[50000, 'Unknown', 70000, 80000],
)
df2 = pd.DataFrame(data=data)
df2

Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,30,Unknown
2,Charlie,35,70000
3,David,Unknown,80000


In [52]:
# Age and Salary mix numbers with the string 'Unknown', so every column is reported as object dtype.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      object
 2   Salary  4 non-null      object
dtypes: object(3)
memory usage: 228.0+ bytes


In [53]:
# pd.to_numeric returns a NEW Series and never modifies df2, so the info() and
# frame shown below are unchanged (Age is still object dtype).
# errors='coerce' turns unparseable entries such as 'Unknown' into NaN.
# The previous errors='ignore' is deprecated (it emitted a FutureWarning) and
# handed the input back untouched, and its result was discarded anyway.
age_numeric = pd.to_numeric(df2['Age'], errors='coerce')
df2.info()
df2

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      object
 2   Salary  4 non-null      object
dtypes: object(3)
memory usage: 228.0+ bytes


  pd.to_numeric(df2['Age'], errors = 'ignore')         #converts to float or int


Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,30,Unknown
2,Charlie,35,70000
3,David,Unknown,80000


In [54]:
# Convert both columns to numbers; anything unparseable ('Unknown') becomes NaN,
# which is why the columns end up as float64 with 3 non-null values each.
cols = ['Age', 'Salary']
df2[cols] = df2[cols].apply(lambda column: pd.to_numeric(column, errors='coerce'))
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    4 non-null      object 
 1   Age     3 non-null      float64
 2   Salary  3 non-null      float64
dtypes: float64(2), object(1)
memory usage: 228.0+ bytes


In [55]:
# Replace the NaN placeholders with 0, then cast the now gap-free columns to integers.
df2[cols] = (
    df2[cols]
    .fillna(0)
    .astype(int)
)

df2.dtypes

Name      object
Age        int64
Salary     int64
dtype: object

In [56]:
# Display the cleaned frame: the former 'Unknown' entries now appear as 0.
df2

Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,30,0
2,Charlie,35,70000
3,David,0,80000


In [57]:
# Order the rows by index label, highest first.
df2.sort_index(ascending=False)

Unnamed: 0,Name,Age,Salary
3,David,0,80000
2,Charlie,35,70000
1,Bob,30,0
0,Alice,25,50000


In [58]:
# axis=1 sorts the columns by their labels (Age, Name, Salary) instead of sorting the rows.
df2.sort_index(axis=1)

Unnamed: 0,Age,Name,Salary
0,25,Alice,50000
1,30,Bob,0
2,35,Charlie,70000
3,0,David,80000


In [59]:
# axis=0 is the default: rows are sorted by index label.
df2.sort_index(axis=0)

Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,30,0
2,Charlie,35,70000
3,David,0,80000
