In [1]:
import numpy as np
import pandas as pd

In [2]:
s = pd.Series(
    np.random.randn(5),
    index=['a', 'b', 'c', 'd', 'e'],
    name='example'
)
s

a    0.827390
b   -1.430557
c   -0.910337
d    0.668856
e   -0.018971
Name: example, dtype: float64

In [3]:
pd.Series(5, index=['a', 'b', 'c', 'd', 'e'])

a    5
b    5
c    5
d    5
e    5
dtype: int64

In [4]:
s[0]

  s[0]


0.8273897114214432

In [5]:
s.head(3)

a    0.827390
b   -1.430557
c   -0.910337
Name: example, dtype: float64

In [6]:
s[:3]

a    0.827390
b   -1.430557
c   -0.910337
Name: example, dtype: float64

In [7]:
s[[4, 2, 1]]

  s[[4, 2, 1]]


e   -0.018971
c   -0.910337
b   -1.430557
Name: example, dtype: float64

In [8]:
print(s.values)
type(s.values)

[ 0.82738971 -1.43055689 -0.91033654  0.66885621 -0.01897141]


numpy.ndarray

In [9]:
s['e'] = 500
s

a      0.827390
b     -1.430557
c     -0.910337
d      0.668856
e    500.000000
Name: example, dtype: float64

In [10]:
s[[True, True, False, False, True]]

a      0.827390
b     -1.430557
e    500.000000
Name: example, dtype: float64

In [11]:
print(s > 0)
print(s[s > 0])

a     True
b    False
c    False
d     True
e     True
Name: example, dtype: bool
a      0.827390
d      0.668856
e    500.000000
Name: example, dtype: float64


In [12]:
s[s < 0] *= -1     # Data manipulation "In place"
s

a      0.827390
b      1.430557
c      0.910337
d      0.668856
e    500.000000
Name: example, dtype: float64

In [13]:
s + s, 2*s, s / 10, s - s

(a       1.654779
 b       2.861114
 c       1.820673
 d       1.337712
 e    1000.000000
 Name: example, dtype: float64,
 a       1.654779
 b       2.861114
 c       1.820673
 d       1.337712
 e    1000.000000
 Name: example, dtype: float64,
 a     0.082739
 b     0.143056
 c     0.091034
 d     0.066886
 e    50.000000
 Name: example, dtype: float64,
 a    0.0
 b    0.0
 c    0.0
 d    0.0
 e    0.0
 Name: example, dtype: float64)

In [15]:
print(type(np.exp(s)))
np.exp(s)

<class 'pandas.core.series.Series'>


a     2.287340e+00
b     4.181027e+00
c     2.485159e+00
d     1.952003e+00
e    1.403592e+217
Name: example, dtype: float64

In [16]:
s.mean()

100.7674278703551

In [17]:
s.median()

0.9103365444136752

In [18]:
d = {
    'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
    'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])
}

df = pd.DataFrame(d)

df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [19]:
d = {
    'one' : 'Hellow',
    'two' : np.array([1., 2., 3., 4.])
}

df = pd.DataFrame(d)

df

Unnamed: 0,one,two
0,Hellow,1.0
1,Hellow,2.0
2,Hellow,3.0
3,Hellow,4.0


In [20]:
df.columns =['1', '2']
df.index = ['a', 'b', 'c', 'd']

df

Unnamed: 0,1,2
a,Hellow,1.0
b,Hellow,2.0
c,Hellow,3.0
d,Hellow,4.0


In [21]:
d = {
    'one' : 'Hellow',
    'two' : np.array([1., 2., 3., 4.])
}

df = pd.DataFrame(d)

df

Unnamed: 0,one,two
0,Hellow,1.0
1,Hellow,2.0
2,Hellow,3.0
3,Hellow,4.0


In [22]:
df['two']

0    1.0
1    2.0
2    3.0
3    4.0
Name: two, dtype: float64

In [23]:
# del df['one'] 
df

Unnamed: 0,one,two
0,Hellow,1.0
1,Hellow,2.0
2,Hellow,3.0
3,Hellow,4.0


In [24]:
df['three'] = df['two'] + df['two']
df['four'] = 'four'
df

Unnamed: 0,one,two,three,four
0,Hellow,1.0,2.0,four
1,Hellow,2.0,4.0,four
2,Hellow,3.0,6.0,four
3,Hellow,4.0,8.0,four


In [25]:
df['five'] = df['four'][:2]
df

Unnamed: 0,one,two,three,four,five
0,Hellow,1.0,2.0,four,four
1,Hellow,2.0,4.0,four,four
2,Hellow,3.0,6.0,four,
3,Hellow,4.0,8.0,four,


In [26]:
df['two']

0    1.0
1    2.0
2    3.0
3    4.0
Name: two, dtype: float64

In [27]:
df[['two', 'four']]

Unnamed: 0,two,four
0,1.0,four
1,2.0,four
2,3.0,four
3,4.0,four


In [28]:
print(df.loc[0:2, 'two'])      # loc selects on the basis of labels, so 0, 1, 2 index would be selected
print(df.iloc[0:2, 0])

0    1.0
1    2.0
2    3.0
Name: two, dtype: float64
0    Hellow
1    Hellow
Name: one, dtype: object


In [29]:
df.iloc[-2:]

Unnamed: 0,one,two,three,four,five
2,Hellow,3.0,6.0,four,
3,Hellow,4.0,8.0,four,


In [30]:
df.copy()

Unnamed: 0,one,two,three,four,five
0,Hellow,1.0,2.0,four,four
1,Hellow,2.0,4.0,four,four
2,Hellow,3.0,6.0,four,
3,Hellow,4.0,8.0,four,


In [31]:
df['two'].astype(np.int64)

0    1
1    2
2    3
3    4
Name: two, dtype: int64

In [32]:
df.T

Unnamed: 0,0,1,2,3
one,Hellow,Hellow,Hellow,Hellow
two,1.0,2.0,3.0,4.0
three,2.0,4.0,6.0,8.0
four,four,four,four,four
five,four,four,,


In [33]:
df.head(2)

Unnamed: 0,one,two,three,four,five
0,Hellow,1.0,2.0,four,four
1,Hellow,2.0,4.0,four,four


In [34]:
df.tail(2)

Unnamed: 0,one,two,three,four,five
2,Hellow,3.0,6.0,four,
3,Hellow,4.0,8.0,four,


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   one     4 non-null      object 
 1   two     4 non-null      float64
 2   three   4 non-null      float64
 3   four    4 non-null      object 
 4   five    2 non-null      object 
dtypes: float64(2), object(3)
memory usage: 292.0+ bytes


In [36]:
df.describe(include='all')

Unnamed: 0,one,two,three,four,five
count,4,4.0,4.0,4,2
unique,1,,,1,1
top,Hellow,,,four,four
freq,4,,,4,2
mean,,2.5,5.0,,
std,,1.290994,2.581989,,
min,,1.0,2.0,,
25%,,1.75,3.5,,
50%,,2.5,5.0,,
75%,,3.25,6.5,,


In [37]:
for i in range(20):
    df[i] = i
df.head()

Unnamed: 0,one,two,three,four,five,0,1,2,3,4,...,10,11,12,13,14,15,16,17,18,19
0,Hellow,1.0,2.0,four,four,0,1,2,3,4,...,10,11,12,13,14,15,16,17,18,19
1,Hellow,2.0,4.0,four,four,0,1,2,3,4,...,10,11,12,13,14,15,16,17,18,19
2,Hellow,3.0,6.0,four,,0,1,2,3,4,...,10,11,12,13,14,15,16,17,18,19
3,Hellow,4.0,8.0,four,,0,1,2,3,4,...,10,11,12,13,14,15,16,17,18,19


In [38]:
df.T

Unnamed: 0,0,1,2,3
one,Hellow,Hellow,Hellow,Hellow
two,1.0,2.0,3.0,4.0
three,2.0,4.0,6.0,8.0
four,four,four,four,four
five,four,four,,
0,0,0,0,0
1,1,1,1,1
2,2,2,2,2
3,3,3,3,3
4,4,4,4,4


In [40]:
pd.set_option

<pandas._config.config.CallableDynamicDoc at 0x1f92737afd0>

In [41]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.precision', 7)


In [42]:
import seaborn as sns
import numpy as np
import pandas as pd

In [43]:
tips = sns.load_dataset('tips', cache=False)
print(tips.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB
None


In [44]:
tips[['total_bill', 'tip']].head()

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.5
3,23.68,3.31
4,24.59,3.61


In [45]:
tips.iloc[3:5, 1:3]

Unnamed: 0,tip,sex
3,3.31,Male
4,3.61,Female


In [46]:
tips.loc[2:4, 'sex': 'smoker']

Unnamed: 0,sex,smoker
2,Male,No
3,Male,No
4,Female,No


In [47]:
print(tips['tip']>3)
len(tips[tips['tip']>3])

0      False
1      False
2       True
3       True
4       True
       ...  
239     True
240    False
241    False
242    False
243    False
Name: tip, Length: 244, dtype: bool


98

In [48]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [49]:
mi_tips = tips.groupby(['sex','smoker']).agg({'tip':'mean', 'size': 'mean'})
mi_tips

  mi_tips = tips.groupby(['sex','smoker']).agg({'tip':'mean', 'size': 'mean'})


Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,Yes,3.0511667,2.5
Male,No,3.1134021,2.7113402
Female,Yes,2.9315152,2.2424242
Female,No,2.7735185,2.5925926


In [50]:
mi_tips['tip']['Male']['Yes']

3.0511666666666666

In [51]:
mi_tips.loc[('Male', 'Yes'),'tip']

3.0511666666666666

In [52]:
mi_tips.at[('Male', 'Yes'), 'tip']

3.0511666666666666

In [53]:
mi_tips.index

MultiIndex([(  'Male', 'Yes'),
            (  'Male',  'No'),
            ('Female', 'Yes'),
            ('Female',  'No')],
           names=['sex', 'smoker'])

In [54]:
ri_tips = mi_tips.reset_index()
ri_tips

Unnamed: 0,sex,smoker,tip,size
0,Male,Yes,3.0511667,2.5
1,Male,No,3.1134021,2.7113402
2,Female,Yes,2.9315152,2.2424242
3,Female,No,2.7735185,2.5925926


In [55]:
ri_tips[(ri_tips['smoker']=='No') & (ri_tips['sex']=='Male')]

Unnamed: 0,sex,smoker,tip,size
1,Male,No,3.1134021,2.7113402


In [56]:
out_tips = mi_tips.reset_index(level=0)
out_tips

Unnamed: 0_level_0,sex,tip,size
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Yes,Male,3.0511667,2.5
No,Male,3.1134021,2.7113402
Yes,Female,2.9315152,2.2424242
No,Female,2.7735185,2.5925926


In [57]:
ri_tips.set_index(['sex', 'smoker'])

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,Yes,3.0511667,2.5
Male,No,3.1134021,2.7113402
Female,Yes,2.9315152,2.2424242
Female,No,2.7735185,2.5925926


In [58]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [59]:
tips.at[0, 'total_bill'] = 9000
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,9000.0,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [60]:
tips.iat[0, 0]

9000.0

In [61]:
%%timeit
tips.at[0, 'total_bill'] = 6

23.6 µs ± 11.3 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [62]:
%%timeit
tips.loc['total_bill', 0] = 6

111 µs ± 34.1 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [63]:
arr = np.random.randn(25).reshape((5, 5))
arr.ndim

2

In [64]:
df = pd.DataFrame(arr)
df

Unnamed: 0,0,1,2,3,4
0,-1.6389166,-0.4730606,-0.6878206,0.541164,1.4533175
1,-0.0487939,-1.4993728,0.8126016,0.4646879,-0.5935705
2,0.2069876,-1.0100118,-0.1907908,0.7582058,0.5739132
3,0.9883699,0.415242,0.6765646,0.7087416,1.2646769
4,2.1918829,1.265093,1.7046001,0.8763487,0.7996762


In [65]:
df.where(df > 0)

Unnamed: 0,0,1,2,3,4
0,,,,0.541164,1.4533175
1,,,0.8126016,0.4646879,
2,0.2069876,,,0.7582058,0.5739132
3,0.9883699,0.415242,0.6765646,0.7087416,1.2646769
4,2.1918829,1.265093,1.7046001,0.8763487,0.7996762


In [66]:
df[df > 0]

Unnamed: 0,0,1,2,3,4
0,,,,0.541164,1.4533175
1,,,0.8126016,0.4646879,
2,0.2069876,,,0.7582058,0.5739132
3,0.9883699,0.415242,0.6765646,0.7087416,1.2646769
4,2.1918829,1.265093,1.7046001,0.8763487,0.7996762


In [67]:
s = pd.Series(["Sam",np.nan,"Tim","Kim"])
s

0    Sam
1    NaN
2    Tim
3    Kim
dtype: object

In [68]:
print(s.isnull())
s.isnull().sum()

0    False
1     True
2    False
3    False
dtype: bool


1

In [69]:
print(s.notnull())
s.notnull().sum()

0     True
1    False
2     True
3     True
dtype: bool


3

In [70]:
s[3] = None
s.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [71]:
s

0     Sam
1     NaN
2     Tim
3    None
dtype: object

In [72]:
s[0] = ""
s.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [73]:
s[0] = "Sam"
s.dropna()

0    Sam
2    Tim
dtype: object

In [74]:
from numpy import nan as NA

In [75]:
df=pd.DataFrame([[1,2,3],[4,NA,5],
                 [NA,NA,NA]])
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,,


In [76]:
df.dropna()

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [77]:
df.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0


In [78]:
df.dropna(thresh=2) # Only retains those rows which have atleast 2 non-null values

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0


In [79]:
df


Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,,


In [80]:
df.fillna({0:15, 1:20, 2:25})

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,20.0,5.0
2,15.0,20.0,25.0


In [81]:
df.fillna(method="ffill")

  df.fillna(method="ffill")


Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,2.0,5.0
2,4.0,2.0,5.0


In [82]:
df.fillna(method="ffill", limit=1)

  df.fillna(method="ffill", limit=1)


Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,2.0,5.0
2,4.0,,5.0


In [83]:
s1=pd.Series(np.arange(4),
             index=["a","c","d","e"])
s2=pd.Series(np.arange(5),
             index=["a","c","e","f","g"])

In [84]:
s1

a    0
c    1
d    2
e    3
dtype: int32

In [85]:
s2

a    0
c    1
e    2
f    3
g    4
dtype: int32

In [86]:
s1 + s2

a    0.0
c    2.0
d    NaN
e    5.0
f    NaN
g    NaN
dtype: float64

In [87]:
df1=pd.DataFrame(
    np.arange(6).reshape(2,3),
    columns=list("ABC"),
    index=["Tim","Tom"])
df2=pd.DataFrame(
    np.arange(9).reshape(3,3),
    columns=list("ACD"),
    index=["Tim","Kate","Tom"])

In [88]:
df1.at['Tim', 'A'] = np.nan
df1

Unnamed: 0,A,B,C
Tim,,1,2
Tom,3.0,4,5


In [89]:
df2.at['Tim', 'A'] = np.nan
df2

Unnamed: 0,A,C,D
Tim,,1,2
Kate,3.0,4,5
Tom,6.0,7,8


In [90]:
df1 + df2

Unnamed: 0,A,B,C,D
Kate,,,,
Tim,,,3.0,
Tom,9.0,,12.0,


In [91]:
df1.add(df2, fill_value=0) # Take good example such that fill_value = 0

Unnamed: 0,A,B,C,D
Kate,3.0,,4.0,5.0
Tim,,1.0,3.0,2.0
Tom,9.0,4.0,12.0,8.0


In [92]:
1/df1


Unnamed: 0,A,B,C
Tim,,1.0,0.5
Tom,0.3333333,0.25,0.2


In [93]:
3*df

Unnamed: 0,0,1,2
0,3.0,6.0,9.0
1,12.0,,15.0
2,,,
