In [1]:
import pandas as pd
pd.__version__

'2.2.3'

In [2]:
from sklearn.datasets import load_diabetes

diabetes = load_diabetes(as_frame=True)
print(type(diabetes['data']))

<class 'pandas.core.frame.DataFrame'>


In [3]:
?load_diabetes

[31mSignature:[39m load_diabetes(*, return_X_y=[38;5;28;01mFalse[39;00m, as_frame=[38;5;28;01mFalse[39;00m, scaled=[38;5;28;01mTrue[39;00m)
[31mDocstring:[39m
Load and return the diabetes dataset (regression).

Samples total    442
Dimensionality   10
Features         real, -.2 < x < .2
Targets          integer 25 - 346

.. note::
   The meaning of each feature (i.e. `feature_names`) might be unclear
   (especially for `ltg`) as the documentation of the original dataset is
   not explicit. We provide information that seems correct in regard with
   the scientific literature in this field of research.

Read more in the :ref:`User Guide <diabetes_dataset>`.

Parameters
----------
return_X_y : bool, default=False
    If True, returns ``(data, target)`` instead of a Bunch object.
    See below for more information about the `data` and `target` object.

    .. versionadded:: 0.18

as_frame : bool, default=False
    If True, the data is a pandas DataFrame including columns with
   

In [4]:
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

Note: Each of these 10 feature variables have bee

In [5]:
df = diabetes['data']

In [6]:
# Creating pandas dataframe from series

cities = pd.Series(['Mumbai', 'Bangalore', 'Chennai', 'Delhi'])
population = pd.Series([17000000, 13000000, 6000000])

In [7]:
city_info_df = pd.DataFrame({'City': cities, 'Population': population})

In [8]:
type(city_info_df)

pandas.core.frame.DataFrame

In [9]:
city_info_df

Unnamed: 0,City,Population
0,Mumbai,17000000.0
1,Bangalore,13000000.0
2,Chennai,6000000.0
3,Delhi,


In [10]:
# Exploring data in dataframe

df.shape

(442, 10)

In [11]:
df.columns

Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'], dtype='object')

In [12]:
list(df.columns)

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
dtypes: float64(10)
memory usage: 34.7 KB


In [14]:
? df.head

[31mSignature:[39m  df.head(n: [33m'int'[39m = [32m5[39m) -> [33m'Self'[39m
[31mDocstring:[39m
Return the first `n` rows.

This function returns the first `n` rows for the object based
on position. It is useful for quickly testing if your object
has the right type of data in it.

For negative values of `n`, this function returns all rows except
the last `|n|` rows, equivalent to ``df[:n]``.

If n is larger than the number of rows, this function returns all rows.

Parameters
----------
n : int, default 5
    Number of rows to select.

Returns
-------
same type as caller
    The first `n` rows of the caller object.

See Also
--------
DataFrame.tail: Returns the last `n` rows.

Examples
--------
>>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
>>> df
      animal
0  alligator
1        bee
2     falcon
3       lion
4     monkey
5     parrot
6      shark
7      whale
8      zebra

Viewin

In [15]:
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [16]:
df.tail()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
437,0.041708,0.05068,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.05068,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.05068,-0.015906,0.017293,-0.037344,-0.01384,-0.024993,-0.01108,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.02656,0.044529,-0.02593
441,-0.045472,-0.044642,-0.07303,-0.081413,0.08374,0.027809,0.173816,-0.039493,-0.004222,0.003064


In [17]:
# Summary Statistics on the dataframe

df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-2.511817e-19,1.23079e-17,-2.245564e-16,-4.79757e-17,-1.3814990000000001e-17,3.9184340000000004e-17,-5.777179e-18,-9.04254e-18,9.293722000000001e-17,1.130318e-17
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665608,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324559,-0.03317903
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698
75%,0.03807591,0.05068012,0.03124802,0.03564379,0.02835801,0.02984439,0.0293115,0.03430886,0.03243232,0.02791705
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118


In [18]:
df.describe(percentiles=[0.2, 0.6, 0.8])

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-2.511817e-19,1.23079e-17,-2.245564e-16,-4.79757e-17,-1.3814990000000001e-17,3.9184340000000004e-17,-5.777179e-18,-9.04254e-18,9.293722000000001e-17,1.130318e-17
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672
20%,-0.04547248,-0.04464164,-0.04048038,-0.04009893,-0.03871969,-0.03695017,-0.03971921,-0.03949338,-0.04117617,-0.03835666
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698
60%,0.01628068,0.05068012,0.005218854,0.008100982,0.00806271,0.008706873,0.008142084,-0.002592262,0.01255119,0.007206516
80%,0.04170844,0.05068012,0.04229559,0.04941519,0.03943444,0.03952068,0.03759519,0.03430886,0.03885335,0.03620126
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118


In [19]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,442.0,-2.511817e-19,0.047619,-0.107226,-0.037299,0.005383,0.038076,0.110727
sex,442.0,1.23079e-17,0.047619,-0.044642,-0.044642,-0.044642,0.05068,0.05068
bmi,442.0,-2.245564e-16,0.047619,-0.090275,-0.034229,-0.007284,0.031248,0.170555
bp,442.0,-4.79757e-17,0.047619,-0.112399,-0.036656,-0.00567,0.035644,0.132044
s1,442.0,-1.3814990000000001e-17,0.047619,-0.126781,-0.034248,-0.004321,0.028358,0.153914
s2,442.0,3.9184340000000004e-17,0.047619,-0.115613,-0.030358,-0.003819,0.029844,0.198788
s3,442.0,-5.777179e-18,0.047619,-0.102307,-0.035117,-0.006584,0.029312,0.181179
s4,442.0,-9.04254e-18,0.047619,-0.076395,-0.039493,-0.002592,0.034309,0.185234
s5,442.0,9.293722000000001e-17,0.047619,-0.126097,-0.033246,-0.001947,0.032432,0.133597
s6,442.0,1.130318e-17,0.047619,-0.137767,-0.033179,-0.001078,0.027917,0.135612


In [20]:
# Selection

df.columns

Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'], dtype='object')

In [21]:
df['age']

0      0.038076
1     -0.001882
2      0.085299
3     -0.089063
4      0.005383
         ...   
437    0.041708
438   -0.005515
439    0.041708
440   -0.045472
441   -0.045472
Name: age, Length: 442, dtype: float64

In [22]:
type(df['age'])

pandas.core.series.Series

In [23]:
df['age'][0]

np.float64(0.038075906433423026)

In [24]:
df['age'][:5]

0    0.038076
1   -0.001882
2    0.085299
3   -0.089063
4    0.005383
Name: age, dtype: float64

In [25]:
df['age'][-5:]

437    0.041708
438   -0.005515
439    0.041708
440   -0.045472
441   -0.045472
Name: age, dtype: float64

In [26]:
df['age'][100:200]

100    0.016281
101    0.016281
102   -0.092695
103    0.059871
104   -0.027310
         ...   
195    0.027178
196   -0.023677
197    0.048974
198   -0.052738
199    0.041708
Name: age, Length: 100, dtype: float64

In [27]:
df[['age', 'sex']]

Unnamed: 0,age,sex
0,0.038076,0.050680
1,-0.001882,-0.044642
2,0.085299,0.050680
3,-0.089063,-0.044642
4,0.005383,-0.044642
...,...,...
437,0.041708,0.050680
438,-0.005515,0.050680
439,0.041708,0.050680
440,-0.045472,-0.044642


In [28]:
df[['age', 'sex']][:5]

Unnamed: 0,age,sex
0,0.038076,0.05068
1,-0.001882,-0.044642
2,0.085299,0.05068
3,-0.089063,-0.044642
4,0.005383,-0.044642


In [29]:
df[['age', 'sex']]

Unnamed: 0,age,sex
0,0.038076,0.050680
1,-0.001882,-0.044642
2,0.085299,0.050680
3,-0.089063,-0.044642
4,0.005383,-0.044642
...,...,...
437,0.041708,0.050680
438,-0.005515,0.050680
439,0.041708,0.050680
440,-0.045472,-0.044642


In [30]:
df.iloc[441]

age   -0.045472
sex   -0.044642
bmi   -0.073030
bp    -0.081413
s1     0.083740
s2     0.027809
s3     0.173816
s4    -0.039493
s5    -0.004222
s6     0.003064
Name: 441, dtype: float64

In [31]:
df.iloc[0]

age    0.038076
sex    0.050680
bmi    0.061696
bp     0.021872
s1    -0.044223
s2    -0.034821
s3    -0.043401
s4    -0.002592
s5     0.019907
s6    -0.017646
Name: 0, dtype: float64

In [32]:
df.loc[0]

age    0.038076
sex    0.050680
bmi    0.061696
bp     0.021872
s1    -0.044223
s2    -0.034821
s3    -0.043401
s4    -0.002592
s5     0.019907
s6    -0.017646
Name: 0, dtype: float64

In [33]:
df.head(n=5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [34]:
df.loc[4, 'age']

np.float64(0.005383060374248237)

In [35]:
df.loc[4, ['age', 'sex']]

age    0.005383
sex   -0.044642
Name: 4, dtype: float64

In [36]:
df.iloc[4, [1, 5, 7, 9]]

sex   -0.044642
s2     0.015596
s4    -0.002592
s6    -0.046641
Name: 4, dtype: float64

In [37]:
rows_condition_met = df.age > 5.383060e-03

In [38]:
df.loc[rows_condition_met]

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
7,0.063504,0.050680,-0.001895,0.066629,0.090620,0.108914,0.022869,0.017703,-0.035816,0.003064
8,0.041708,0.050680,0.061696,-0.040099,-0.013953,0.006202,-0.028674,-0.002592,-0.014960,0.011349
...,...,...,...,...,...,...,...,...,...,...
431,0.070769,0.050680,-0.030996,0.021872,-0.037344,-0.047034,0.033914,-0.039493,-0.014960,-0.001078
432,0.009016,-0.044642,0.055229,-0.005670,0.057597,0.044719,-0.002903,0.023239,0.055686,0.106617
434,0.016281,-0.044642,0.001339,0.008101,0.005311,0.010899,0.030232,-0.039493,-0.045424,0.032059
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207


In [39]:
df.loc[df.age > 5.383060e-03]

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
7,0.063504,0.050680,-0.001895,0.066629,0.090620,0.108914,0.022869,0.017703,-0.035816,0.003064
8,0.041708,0.050680,0.061696,-0.040099,-0.013953,0.006202,-0.028674,-0.002592,-0.014960,0.011349
...,...,...,...,...,...,...,...,...,...,...
431,0.070769,0.050680,-0.030996,0.021872,-0.037344,-0.047034,0.033914,-0.039493,-0.014960,-0.001078
432,0.009016,-0.044642,0.055229,-0.005670,0.057597,0.044719,-0.002903,0.023239,0.055686,0.106617
434,0.016281,-0.044642,0.001339,0.008101,0.005311,0.010899,0.030232,-0.039493,-0.045424,0.032059
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207


In [40]:
age_df_temp = df.loc[df.age < 5.383060e-03]

In [41]:
age_df_temp = df[df.age < 5.383060e-03]

In [42]:
age_df_temp.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
5,-0.092695,-0.044642,-0.040696,-0.019442,-0.068991,-0.079288,0.041277,-0.076395,-0.041176,-0.096346
6,-0.045472,0.05068,-0.047163,-0.015999,-0.040096,-0.0248,0.000779,-0.039493,-0.062917,-0.038357
9,-0.0709,-0.044642,0.039062,-0.033213,-0.012577,-0.034508,-0.024993,-0.002592,0.067737,-0.013504


In [43]:
age_df_temp.iloc[2]

age   -0.092695
sex   -0.044642
bmi   -0.040696
bp    -0.019442
s1    -0.068991
s2    -0.079288
s3     0.041277
s4    -0.076395
s5    -0.041176
s6    -0.096346
Name: 5, dtype: float64

In [44]:
age_df_temp.loc[1]

age   -0.001882
sex   -0.044642
bmi   -0.051474
bp    -0.026328
s1    -0.008449
s2    -0.019163
s3     0.074412
s4    -0.039493
s5    -0.068332
s6    -0.092204
Name: 1, dtype: float64

In [45]:
age_df_temp = df.loc[(df.age < 5.383060e-03) & (df.sex > -4.464164e-02 )]

In [46]:
age_df_temp.shape

(214, 10)

In [47]:
age_df_temp

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
5,-0.092695,-0.044642,-0.040696,-0.019442,-0.068991,-0.079288,0.041277,-0.076395,-0.041176,-0.096346
6,-0.045472,0.050680,-0.047163,-0.015999,-0.040096,-0.024800,0.000779,-0.039493,-0.062917,-0.038357
9,-0.070900,-0.044642,0.039062,-0.033213,-0.012577,-0.034508,-0.024993,-0.002592,0.067737,-0.013504
...,...,...,...,...,...,...,...,...,...,...
435,-0.012780,-0.044642,-0.023451,-0.040099,-0.016704,0.004636,-0.017629,-0.002592,-0.038460,-0.038357
436,-0.056370,-0.044642,-0.074108,-0.050427,-0.024960,-0.047034,0.092820,-0.076395,-0.061176,-0.046641
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [48]:
list("ABCD")

['A', 'B', 'C', 'D']

In [49]:
list("abcdefghi")

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']

In [50]:
import numpy as np

In [51]:
another_df = pd.DataFrame(
    np.random.rand(100, 4),
    index=range(10, 110),
    columns=list("ABCD"))

In [52]:
another_df.head()

Unnamed: 0,A,B,C,D
10,0.028101,0.202399,0.398662,0.311933
11,0.32877,0.831308,0.321757,0.037866
12,0.119224,0.413598,0.72371,0.690164
13,0.909739,0.606557,0.189842,0.397427
14,0.397407,0.473029,0.130702,0.212838


In [53]:
another_df.tail()

Unnamed: 0,A,B,C,D
105,0.532857,0.094891,0.798958,0.062005
106,0.608337,0.953542,0.913845,0.661291
107,0.31471,0.215208,0.410598,0.683664
108,0.432391,0.577163,0.212313,0.016635
109,0.060532,0.32563,0.327465,0.711436


In [54]:
df = pd.DataFrame(np.random.rand(9, 4), index=list("abcdefghi"), columns=list("ABCD"))

In [55]:
df.shape

(9, 4)

In [56]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [57]:
selector = lambda df: df['A'] > 0

In [58]:
selector

<function __main__.<lambda>(df)>

In [59]:
df.loc[selector]

Unnamed: 0,A,B,C,D
a,0.939647,0.175265,0.188185,0.475189
b,0.01138,0.087344,0.285504,0.937199
c,0.10088,0.099863,0.622481,0.453287
d,0.227133,0.815822,0.013137,0.384672
e,0.482952,0.05489,0.219516,0.943858
f,0.741893,0.035252,0.858849,0.198183
g,0.948503,0.528745,0.373274,0.807166
h,0.868301,0.984955,0.696381,0.22898
i,0.096378,0.864338,0.458358,0.545292


In [60]:
selector = lambda df: df['A'] > 0.5

In [61]:
selector = lambda df: (df['A'] > 0.5)&(df['B'] < 0.2)

In [62]:
df.loc[selector]

Unnamed: 0,A,B,C,D
a,0.939647,0.175265,0.188185,0.475189
f,0.741893,0.035252,0.858849,0.198183


In [63]:
condition_for_selection = (df['A'] > 0.5)&(df['B'] < 0.2)

In [64]:
condition_for_selection

a     True
b    False
c    False
d    False
e    False
f     True
g    False
h    False
i    False
dtype: bool

In [65]:
df[condition_for_selection]

Unnamed: 0,A,B,C,D
a,0.939647,0.175265,0.188185,0.475189
f,0.741893,0.035252,0.858849,0.198183


In [66]:
condition_for_selection = (df['A'] > 0.5) | ~(df['B'] < 0.2)

In [67]:
df[condition_for_selection]

Unnamed: 0,A,B,C,D
a,0.939647,0.175265,0.188185,0.475189
d,0.227133,0.815822,0.013137,0.384672
f,0.741893,0.035252,0.858849,0.198183
g,0.948503,0.528745,0.373274,0.807166
h,0.868301,0.984955,0.696381,0.22898
i,0.096378,0.864338,0.458358,0.545292


In [68]:
# Adding a column in the dataframe

df['E'] = df['A']*100

In [69]:
df

Unnamed: 0,A,B,C,D,E
a,0.939647,0.175265,0.188185,0.475189,93.964698
b,0.01138,0.087344,0.285504,0.937199,1.13804
c,0.10088,0.099863,0.622481,0.453287,10.088008
d,0.227133,0.815822,0.013137,0.384672,22.71335
e,0.482952,0.05489,0.219516,0.943858,48.295222
f,0.741893,0.035252,0.858849,0.198183,74.189342
g,0.948503,0.528745,0.373274,0.807166,94.850276
h,0.868301,0.984955,0.696381,0.22898,86.830078
i,0.096378,0.864338,0.458358,0.545292,9.637759


In [70]:
df["F"] = df["A"] + df["C"]

In [71]:
df

Unnamed: 0,A,B,C,D,E,F
a,0.939647,0.175265,0.188185,0.475189,93.964698,1.127832
b,0.01138,0.087344,0.285504,0.937199,1.13804,0.296885
c,0.10088,0.099863,0.622481,0.453287,10.088008,0.723361
d,0.227133,0.815822,0.013137,0.384672,22.71335,0.24027
e,0.482952,0.05489,0.219516,0.943858,48.295222,0.702468
f,0.741893,0.035252,0.858849,0.198183,74.189342,1.600742
g,0.948503,0.528745,0.373274,0.807166,94.850276,1.321776
h,0.868301,0.984955,0.696381,0.22898,86.830078,1.564682
i,0.096378,0.864338,0.458358,0.545292,9.637759,0.554736


In [72]:
criteria = df['A'] < 0.2

In [73]:
criteria

a    False
b     True
c     True
d    False
e    False
f    False
g    False
h    False
i     True
Name: A, dtype: bool

In [74]:
df.loc[criteria, 'A'] = 0

In [75]:
df

Unnamed: 0,A,B,C,D,E,F
a,0.939647,0.175265,0.188185,0.475189,93.964698,1.127832
b,0.0,0.087344,0.285504,0.937199,1.13804,0.296885
c,0.0,0.099863,0.622481,0.453287,10.088008,0.723361
d,0.227133,0.815822,0.013137,0.384672,22.71335,0.24027
e,0.482952,0.05489,0.219516,0.943858,48.295222,0.702468
f,0.741893,0.035252,0.858849,0.198183,74.189342,1.600742
g,0.948503,0.528745,0.373274,0.807166,94.850276,1.321776
h,0.868301,0.984955,0.696381,0.22898,86.830078,1.564682
i,0.0,0.864338,0.458358,0.545292,9.637759,0.554736


In [76]:
cities = ['Mumbai', 'Delhi', 'Chennai', 'Kolkata', 'Bengalure', 'Hyderabad', 'Pune', 'Ahmedabad', 'Indore']

In [77]:
df['city'] = cities

In [78]:
df_copy = df.copy()

In [79]:
df

Unnamed: 0,A,B,C,D,E,F,city
a,0.939647,0.175265,0.188185,0.475189,93.964698,1.127832,Mumbai
b,0.0,0.087344,0.285504,0.937199,1.13804,0.296885,Delhi
c,0.0,0.099863,0.622481,0.453287,10.088008,0.723361,Chennai
d,0.227133,0.815822,0.013137,0.384672,22.71335,0.24027,Kolkata
e,0.482952,0.05489,0.219516,0.943858,48.295222,0.702468,Bengalure
f,0.741893,0.035252,0.858849,0.198183,74.189342,1.600742,Hyderabad
g,0.948503,0.528745,0.373274,0.807166,94.850276,1.321776,Pune
h,0.868301,0.984955,0.696381,0.22898,86.830078,1.564682,Ahmedabad
i,0.0,0.864338,0.458358,0.545292,9.637759,0.554736,Indore


In [80]:
cities_new = ['Mumbai', 'Delhi', 'Chennai',
              'Kolkata', 'Bengalure', 'Hyderabad',
              'Pune', 'Ahmedabad', 'Guwahati']

In [81]:
df_copy['new_city'] = cities_new

In [82]:
df_copy

Unnamed: 0,A,B,C,D,E,F,city,new_city
a,0.939647,0.175265,0.188185,0.475189,93.964698,1.127832,Mumbai,Mumbai
b,0.0,0.087344,0.285504,0.937199,1.13804,0.296885,Delhi,Delhi
c,0.0,0.099863,0.622481,0.453287,10.088008,0.723361,Chennai,Chennai
d,0.227133,0.815822,0.013137,0.384672,22.71335,0.24027,Kolkata,Kolkata
e,0.482952,0.05489,0.219516,0.943858,48.295222,0.702468,Bengalure,Bengalure
f,0.741893,0.035252,0.858849,0.198183,74.189342,1.600742,Hyderabad,Hyderabad
g,0.948503,0.528745,0.373274,0.807166,94.850276,1.321776,Pune,Pune
h,0.868301,0.984955,0.696381,0.22898,86.830078,1.564682,Ahmedabad,Ahmedabad
i,0.0,0.864338,0.458358,0.545292,9.637759,0.554736,Indore,Guwahati


In [83]:
criteria = df_copy['city'].isin(['Pune', 'Bengaluru', 'Hyderabad'])

In [84]:
df_copy.loc[df.city == 'Bengalure', ['city', 'new_city']] = 'Bengaluru'

In [85]:
df_copy

Unnamed: 0,A,B,C,D,E,F,city,new_city
a,0.939647,0.175265,0.188185,0.475189,93.964698,1.127832,Mumbai,Mumbai
b,0.0,0.087344,0.285504,0.937199,1.13804,0.296885,Delhi,Delhi
c,0.0,0.099863,0.622481,0.453287,10.088008,0.723361,Chennai,Chennai
d,0.227133,0.815822,0.013137,0.384672,22.71335,0.24027,Kolkata,Kolkata
e,0.482952,0.05489,0.219516,0.943858,48.295222,0.702468,Bengaluru,Bengaluru
f,0.741893,0.035252,0.858849,0.198183,74.189342,1.600742,Hyderabad,Hyderabad
g,0.948503,0.528745,0.373274,0.807166,94.850276,1.321776,Pune,Pune
h,0.868301,0.984955,0.696381,0.22898,86.830078,1.564682,Ahmedabad,Ahmedabad
i,0.0,0.864338,0.458358,0.545292,9.637759,0.554736,Indore,Guwahati


In [86]:
?df_copy.drop

[31mSignature:[39m
df_copy.drop(
    labels: [33m'IndexLabel | None'[39m = [38;5;28;01mNone[39;00m,
    *,
    axis: [33m'Axis'[39m = [32m0[39m,
    index: [33m'IndexLabel | None'[39m = [38;5;28;01mNone[39;00m,
    columns: [33m'IndexLabel | None'[39m = [38;5;28;01mNone[39;00m,
    level: [33m'Level | None'[39m = [38;5;28;01mNone[39;00m,
    inplace: [33m'bool'[39m = [38;5;28;01mFalse[39;00m,
    errors: [33m'IgnoreRaise'[39m = [33m'raise'[39m,
) -> [33m'DataFrame | None'[39m
[31mDocstring:[39m
Drop specified labels from rows or columns.

Remove rows or columns by specifying label names and corresponding
axis, or by directly specifying index or column names. When using a
multi-index, labels on different levels can be removed by specifying
the level. See the :ref:`user guide <advanced.shown_levels>`
for more information about the now unused levels.

Parameters
----------
labels : single label or list-like
    Index or column labels to drop. A tuple will

In [88]:
df_copy.drop(['new_cities'], axis=1)

KeyError: "['new_cities'] not found in axis"

In [89]:
criteria

a    False
b    False
c    False
d    False
e    False
f     True
g     True
h    False
i    False
Name: city, dtype: bool

In [90]:
?df_copy.sample

[31mSignature:[39m
df_copy.sample(
    n: [33m'int | None'[39m = [38;5;28;01mNone[39;00m,
    frac: [33m'float | None'[39m = [38;5;28;01mNone[39;00m,
    replace: [33m'bool_t'[39m = [38;5;28;01mFalse[39;00m,
    weights=[38;5;28;01mNone[39;00m,
    random_state: [33m'RandomState | None'[39m = [38;5;28;01mNone[39;00m,
    axis: [33m'Axis | None'[39m = [38;5;28;01mNone[39;00m,
    ignore_index: [33m'bool_t'[39m = [38;5;28;01mFalse[39;00m,
) -> [33m'Self'[39m
[31mDocstring:[39m
Return a random sample of items from an axis of object.

You can use `random_state` for reproducibility.

Parameters
----------
n : int, optional
    Number of items from axis to return. Cannot be used with `frac`.
    Default = 1 if `frac` = None.
frac : float, optional
    Fraction of axis items to return. Cannot be used with `n`.
replace : bool, default False
    Allow or disallow sampling of the same row more than once.
weights : str or ndarray-like, optional
    Default 'None' r

In [91]:
df_copy.sample(3)

Unnamed: 0,A,B,C,D,E,F,city,new_city
c,0.0,0.099863,0.622481,0.453287,10.088008,0.723361,Chennai,Chennai
e,0.482952,0.05489,0.219516,0.943858,48.295222,0.702468,Bengaluru,Bengaluru
h,0.868301,0.984955,0.696381,0.22898,86.830078,1.564682,Ahmedabad,Ahmedabad


In [92]:
df_copy.sample(3, random_state=42)

Unnamed: 0,A,B,C,D,E,F,city,new_city
h,0.868301,0.984955,0.696381,0.22898,86.830078,1.564682,Ahmedabad,Ahmedabad
b,0.0,0.087344,0.285504,0.937199,1.13804,0.296885,Delhi,Delhi
f,0.741893,0.035252,0.858849,0.198183,74.189342,1.600742,Hyderabad,Hyderabad


In [93]:
df_copy.sample(3, random_state=42, replace=True)

Unnamed: 0,A,B,C,D,E,F,city,new_city
g,0.948503,0.528745,0.373274,0.807166,94.850276,1.321776,Pune,Pune
d,0.227133,0.815822,0.013137,0.384672,22.71335,0.24027,Kolkata,Kolkata
h,0.868301,0.984955,0.696381,0.22898,86.830078,1.564682,Ahmedabad,Ahmedabad


In [94]:
import pandas as pd

cities = ['Mumbai', 'Chennai', 'Pune', 'Ahmedabad', 'Kolkata', 'Kanpur', 'Delhi']
city_df = pd.DataFrame(cities)

In [95]:
city_df.columns = ['City_Name']

In [97]:
city_df

Unnamed: 0,City_Name
0,Mumbai
1,Chennai
2,Pune
3,Ahmedabad
4,Kolkata
5,Kanpur
6,Delhi


In [98]:
condition_met = city_df.City_Name == 'Mumbai'

In [99]:
city_df[condition_met]

Unnamed: 0,City_Name
0,Mumbai


In [100]:
city_df[city_df.City_Name == 'Pune']

Unnamed: 0,City_Name
2,Pune


In [102]:
# Aggregation and grouping

import numpy as np

In [103]:
random_state = np.random.RandomState(100)
random_series = pd.Series(random_state.rand(10))

In [104]:
random_series

0    0.543405
1    0.278369
2    0.424518
3    0.844776
4    0.004719
5    0.121569
6    0.670749
7    0.825853
8    0.136707
9    0.575093
dtype: float64

In [105]:
random_series.mean()

np.float64(0.4425757785871915)

In [106]:
random_series.std()

np.float64(0.2988992029497061)

In [107]:
random_series.sum()

np.float64(4.425757785871915)

In [108]:
df = pd.DataFrame({'A': random_state.rand(5),
                   'B': random_state.rand(5)})
df

Unnamed: 0,A,B
0,0.891322,0.978624
1,0.209202,0.811683
2,0.185328,0.171941
3,0.108377,0.816225
4,0.219697,0.274074


In [109]:
df.sum()

A    1.613927
B    3.052546
dtype: float64

In [110]:
df.mean()

A    0.322785
B    0.610509
dtype: float64

In [111]:
df.mean(axis=1)

0    0.934973
1    0.510443
2    0.178635
3    0.462301
4    0.246886
dtype: float64

In [None]:
# Groupby

Three stages
* Split - we split dataframe into multiple smaller dataframe based on the values of keys
* Apply - we apply desired aggregation/transformation on each dataframe.
* Combine - we combine results from apply state into a dataframe

In [113]:
df = pd.DataFrame({'key': ['A','B','C']*2, #list("ABCABC"), ['A','B','C','A','B','C']
                   'data': range(6)})

In [114]:
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [115]:
df.groupby("key")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000141FA1A7170>

In [116]:
df.groupby("key").sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7
