# Creating Series from real-world datasets

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
pd.read_csv('subs.csv') # read_csv returns a DataFrame by default, even when the file has a single data column

Unnamed: 0,Subscribers gained
0,48
1,57
2,40
3,43
4,44
...,...
360,231
361,226
362,155
363,144


In [4]:
type(pd.read_csv('subs.csv')) # confirm that read_csv hands back a DataFrame, not a Series

pandas.core.frame.DataFrame

In [5]:
# Convert the single-column DataFrame into a Series.
# squeeze('columns') collapses only the column axis; a bare squeeze() collapses
# every length-1 axis, so a one-row file would come back as a scalar instead of
# a Series.
subs_ser = pd.read_csv('subs.csv').squeeze('columns')
subs_ser

0       48
1       57
2       40
3       43
4       44
      ... 
360    231
361    226
362    155
363    144
364    172
Name: Subscribers gained, Length: 365, dtype: int64

In [6]:
type(subs_ser) # after squeezing, the object is a Series rather than a DataFrame

pandas.core.series.Series

In [7]:
# The file has 2 columns: match_no becomes the index, which leaves a single
# data column ('runs') that can be squeezed into a Series.
# squeeze('columns') collapses only the column axis, so the result is a Series
# even if the file held just one row (a bare squeeze() would return a scalar).
kholi_ser = pd.read_csv('kohli_ipl.csv', index_col='match_no').squeeze('columns')
kholi_ser

match_no
1       1
2      23
3      13
4      12
5       1
       ..
211     0
212    20
213    73
214    25
215     7
Name: runs, Length: 215, dtype: int64

In [8]:
type(kholi_ser) # index_col + squeeze turned the 2-column file into a Series

pandas.core.series.Series

In [9]:
# The file has 2 columns: movie becomes the index, which leaves a single data
# column ('lead') that can be squeezed into a Series.
# squeeze('columns') collapses only the column axis, so the result is a Series
# even if the file held just one row (a bare squeeze() would return a scalar).
bollywood_ser = pd.read_csv('bollywood.csv', index_col='movie').squeeze('columns')
bollywood_ser

movie
Uri: The Surgical Strike                   Vicky Kaushal
Battalion 609                                Vicky Ahuja
The Accidental Prime Minister (film)         Anupam Kher
Why Cheat India                            Emraan Hashmi
Evening Shadows                         Mona Ambegaonkar
                                              ...       
Hum Tumhare Hain Sanam                    Shah Rukh Khan
Aankhen (2002 film)                     Amitabh Bachchan
Saathiya (film)                             Vivek Oberoi
Company (film)                                Ajay Devgn
Awara Paagal Deewana                        Akshay Kumar
Name: lead, Length: 1500, dtype: object

# Series Methods
- head()
- tail()
- sample()
- value_counts()
- sort_values()
- sort_index()

In [10]:
subs_ser.head() # preview the first rows of the Series; shows 5 rows by default

0    48
1    57
2    40
3    43
4    44
Name: Subscribers gained, dtype: int64

In [11]:
subs_ser.head(3) # pass a number to choose how many leading rows are shown

0    48
1    57
2    40
Name: Subscribers gained, dtype: int64

In [12]:
kholi_ser.tail() # last rows of the Series; shows 5 rows by default

match_no
211     0
212    20
213    73
214    25
215     7
Name: runs, dtype: int64

In [13]:
bollywood_ser.sample(3) # 3 randomly chosen rows; the result differs on every run unless random_state is passed

movie
Qarib Qarib Singlle     Irrfan Khan
Inkaar (2013 film)     Arjun Rampal
Kucch Luv Jaisaa       Shefali Shah
Name: lead, dtype: object

In [14]:
# when to use sample?
# when the dataset may be ordered in a biased way (so head()/tail() would not be representative) and you want a randomly selected preview instead

In [15]:
bollywood_ser # redisplay the Series (index = movie, value = lead actor) before counting values

movie
Uri: The Surgical Strike                   Vicky Kaushal
Battalion 609                                Vicky Ahuja
The Accidental Prime Minister (film)         Anupam Kher
Why Cheat India                            Emraan Hashmi
Evening Shadows                         Mona Ambegaonkar
                                              ...       
Hum Tumhare Hain Sanam                    Shah Rukh Khan
Aankhen (2002 film)                     Amitabh Bachchan
Saathiya (film)                             Vivek Oberoi
Company (film)                                Ajay Devgn
Awara Paagal Deewana                        Akshay Kumar
Name: lead, Length: 1500, dtype: object

In [16]:
# How many movies has each actor done as the lead?
bollywood_ser.value_counts() # frequency of each distinct value, sorted by count in descending order by default

lead
Akshay Kumar        48
Amitabh Bachchan    45
Ajay Devgn          38
Salman Khan         31
Sanjay Dutt         26
                    ..
Diganth              1
Parveen Kaur         1
Seema Azmi           1
Akanksha Puri        1
Edwin Fernandes      1
Name: count, Length: 566, dtype: int64

In [17]:
# sort_values returns a new Series ordered by value (an inplace option also exists)
kholi_ser.sort_values() # ascending order by default

match_no
87       0
211      0
207      0
206      0
91       0
      ... 
164    100
120    100
123    108
126    109
128    113
Name: runs, Length: 215, dtype: int64

In [18]:
kholi_ser.sort_values(ascending=False) # descending order: highest scores first

match_no
128    113
126    109
123    108
164    100
120    100
      ... 
93       0
211      0
130      0
8        0
135      0
Name: runs, Length: 215, dtype: int64

In [19]:
# What is VK's highest score in the IPL?
# Method chaining: sort descending -> keep the top row -> pull the raw value out
# of the underlying array. kholi_ser.max() returns the same number; the chain is
# kept here to illustrate how calls can be strung together.
kholi_ser.sort_values(ascending=False).head(1).values[0]

113

In [20]:
# sort_index orders the Series by its index labels instead of by its values
bollywood_ser.sort_index() # the index is the movie title, so this lists the movies alphabetically

movie
1920 (film)                   Rajniesh Duggall
1920: London                     Sharman Joshi
1920: The Evil Returns             Vicky Ahuja
1971 (2007 film)                Manoj Bajpayee
2 States (2014 film)              Arjun Kapoor
                                   ...        
Zindagi 50-50                      Veena Malik
Zindagi Na Milegi Dobara        Hrithik Roshan
Zindagi Tere Naam           Mithun Chakraborty
Zokkomon                       Darsheel Safary
Zor Lagaa Ke...Haiya!            Meghan Jadhav
Name: lead, Length: 1500, dtype: object

# Series Math Methods
- count
- sum, product
- mean, median, mode, std, var
- min/max
- describe

In [21]:
kholi_ser.count() # number of non-missing values (equals the row count only when nothing is missing)

215

In [22]:
kholi_ser.size # total number of elements, missing values included

215

In [23]:
# how count differs from size
# count -> counts only the non-missing values in a series
# size -> counts all values in a series (non-missing + missing, i.e. NaN)

In [24]:
subs_ser.sum() # total of all the values (missing values are skipped by default)

49510

In [25]:
# NOTE(review): the displayed 0 is not the true product. The smallest value in
# this series is 33 (see min() below), so no factor is zero; multiplying 365
# such values overflows the int64 dtype and the result wraps around.
# Treat this output as an overflow artifact, not a real answer.
subs_ser.product()

0

In [26]:
subs_ser.mean() # arithmetic mean of the values

135.64383561643837

In [27]:
kholi_ser.median() # middle value of the sorted runs; returned as a float

24.0

In [28]:
bollywood_ser.mode() # most frequently occurring value, i.e. the actor with the most lead roles; returns a Series because ties are possible

0    Akshay Kumar
Name: lead, dtype: object

In [29]:
subs_ser.std() # sample standard deviation (pandas normalises by N-1, ddof=1, by default)

62.67502303725269

In [30]:
kholi_ser.var() # sample variance (ddof=1 by default), i.e. the square of std()

688.0024777222344

In [31]:
subs_ser.max() # largest value in the series

396

In [32]:
subs_ser.min() # smallest value in the series

33

In [33]:
kholi_ser.describe() # summary statistics in one call: count, mean, std, min, quartiles (25%/50%/75%) and max

count    215.000000
mean      30.855814
std       26.229801
min        0.000000
25%        9.000000
50%       24.000000
75%       48.000000
max      113.000000
Name: runs, dtype: float64