In [39]:
import pandas as pd
import numpy as np
%matplotlib inline

### Series Object

A Series object in pandas is simply a one-dimensional array of indexed data. Each element has an associated index label, and the data and the labels can be accessed through the `values` and `index` attributes.

    Syntax: 
        pd.Series(data, index = index)

index argument is optional

In [43]:
# Build a Series from a plain Python list; pandas assigns the default 0..n-1 index.
s = pd.Series(data=[1, 3, 4, 6, 7, 12])
s

0     1
1     3
2     4
3     6
4     7
5    12
dtype: int64

In [44]:
# as you can see the result shows a sequence of indices and values.

s.values # getting the values only

array([ 1,  3,  4,  6,  7, 12], dtype=int64)

In [45]:
s.index # accessing the indices

RangeIndex(start=0, stop=6, step=1)

In [46]:
# we can also get the values as follows:
s[0]

1

In [47]:
s[3]

6

In [48]:
s[:5]

0    1
1    3
2    4
3    6
4    7
dtype: int64

### The difference between Pandas Series Object and One-Dimensional array

The main difference between a Series object and a NumPy one-dimensional array is the index. An important feature of the Series object is that the index labels can be of any type and can be set up in any way.

In [49]:
# Same kind of construction, but with an explicit, non-sequential integer index.
s = pd.Series(
    data=[1, 4, 5, 7, 8, 11],
    index=[2, 3, 8, 4, 6, 7],
)
s

2     1
3     4
8     5
4     7
6     8
7    11
dtype: int64

In [50]:
# Index labels do not have to be integers: here every value is labelled with a string.
s = pd.Series(data=[2, 3, 4, 5, 6, 7], index=list("bcsdne"))
s

b    2
c    3
s    4
d    5
n    6
e    7
dtype: int64

In [51]:
s['b']

2

In [52]:
# A dictionary maps directly onto a Series: the keys become the index labels
# and the values become the data.
name_age_dict = dict(Alex=22, John=17, Samantha=25)

s = pd.Series(data=name_age_dict)
s

Alex        22
John        17
Samantha    25
dtype: int64

In [53]:
s['Alex'] # Accessing values

22

In [54]:
s['Alex':'Samantha'] # Slicing

Alex        22
John        17
Samantha    25
dtype: int64

### Pandas DataFrame Object

A DataFrame object is similar to a NumPy two-dimensional array with row indices and column names.

In [55]:
# Two Series that share the same default index become the columns of one DataFrame.
age = pd.Series(data=[20, 23, 35, 54])
name = pd.Series(data=['Smith', 'Robin', 'Alain', 'Pierre'])

df = pd.DataFrame(dict(Age=age, Name=name))
df

Unnamed: 0,Age,Name
0,20,Smith
1,23,Robin
2,35,Alain
3,54,Pierre


In [56]:
df.index

RangeIndex(start=0, stop=4, step=1)

In [57]:
# Quiz marks keyed by student name.
quizzes = dict(Smith=65, Robin=82, Alain=48, Pierre=95, John=77)


In [58]:
# Final-exam marks keyed by student name
# (example from the book python for data science handbook).
Final_exam = {
    'Smith': 89,
    'Robin': 69,
    'Alain': 97,
    'Pierre': 70,
    'John': 64,
}

In [59]:
# Each dictionary becomes one column; the dictionary keys (student names)
# become the row index.
student_stats = pd.DataFrame(
    data={
        'Quizzes': quizzes,
        'Final Exam': Final_exam,
    }
)

student_stats

Unnamed: 0,Quizzes,Final Exam
Smith,65,89
Robin,82,69
Alain,48,97
Pierre,95,70
John,77,64


In [60]:
student_stats.index

Index(['Smith', 'Robin', 'Alain', 'Pierre', 'John'], dtype='object')

In [61]:
student_stats['Quizzes']  # Calling a column name will return a Series of column data

Smith     65
Robin     82
Alain     48
Pierre    95
John      77
Name: Quizzes, dtype: int64

### Creating DataFrame

In [62]:
Q = pd.Series(quizzes) # A Series Object
Q

Smith     65
Robin     82
Alain     48
Pierre    95
John      77
dtype: int64

In [63]:
type(Q)

pandas.core.series.Series

In [64]:
pd.DataFrame(Q) # Creating a DataFrame from a Series Object

Unnamed: 0,0
Smith,65
Robin,82
Alain,48
Pierre,95
John,77


In [65]:
pd.DataFrame(Q, columns=['Quizzes']) # adding a column name

Unnamed: 0,Quizzes
Smith,65
Robin,82
Alain,48
Pierre,95
John,77


In [66]:
final_exam = pd.Series(Final_exam)
type(final_exam)

pandas.core.series.Series

In [67]:
pd.DataFrame(final_exam, columns=['Final Exam']) # creating another dataframe from another series object

Unnamed: 0,Final Exam
Smith,89
Robin,69
Alain,97
Pierre,70
John,64


In [68]:
# Creating a DataFrame from a dictionary of Series objects.
# Both values are Series here (Q and final_exam, built in the cells above);
# passing the plain `quizzes` dict would not match what this cell describes.

pd.DataFrame({'Quizzes': Q, 'Final Exam': final_exam})

Unnamed: 0,Quizzes,Final Exam
Smith,65,89
Robin,82,69
Alain,48,97
Pierre,95,70
John,77,64


In [69]:
# Creating a DataFrame from a 2-D NumPy array.
# A seeded Generator is used so that re-running the notebook reproduces the
# same numbers every time.

rng = np.random.default_rng(seed=0)
random_frame = pd.DataFrame(rng.random((2, 2)), columns=['name', 'age'], index=['@', '!'])
random_frame

Unnamed: 0,name,age
@,0.20131,0.198742
!,0.208055,0.82945


### Data Selection in Series 

In [78]:
# Load the Kaggle survey responses from the CSV file next to the notebook.
# low_memory=False makes pandas parse the file in one pass rather than in chunks.
df = pd.read_csv(
    "kaggle_survey_2021_responses.csv",
    low_memory=False,
)

In [80]:
df.head(3)

Unnamed: 0,Time from Start to Finish (seconds),Q1,Q2,Q3,Q4,Q5,Q6,Q7_Part_1,Q7_Part_2,Q7_Part_3,...,Q38_B_Part_3,Q38_B_Part_4,Q38_B_Part_5,Q38_B_Part_6,Q38_B_Part_7,Q38_B_Part_8,Q38_B_Part_9,Q38_B_Part_10,Q38_B_Part_11,Q38_B_OTHER
0,Duration (in seconds),What is your age (# years)?,What is your gender? - Selected Choice,In which country do you currently reside?,What is the highest level of formal education ...,Select the title most similar to your current ...,For how many years have you been writing code ...,What programming languages do you use on a reg...,What programming languages do you use on a reg...,What programming languages do you use on a reg...,...,"In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor..."
1,910,50-54,Man,India,Bachelor’s degree,Other,5-10 years,Python,R,,...,,,,,,,,,,
2,784,50-54,Man,Indonesia,Master’s degree,Program/Project Manager,20+ years,,,SQL,...,,,,,,,,,,


In [81]:
# Let's work with the DataFrame df that was loaded above.
# Any single column of a DataFrame is a Series object. Let's check that:

type(df['Q1']) # 'Q1' is the second column of df

pandas.core.series.Series

In [82]:
# Take an independent copy of the 'Q1' column: a later cell assigns into
# Series_object, and without .copy() that write could propagate back into df
# (or trigger a SettingWithCopyWarning).
Series_object = df['Q1'].copy()

In [83]:
Series_object[0] # Selecting the value of index 0

'What is your age (# years)?'

In [84]:
Series_object[1]

'50-54'

In [85]:
'a' in Series_object # is 'a' an index of Series_object

False

In [86]:
2 in Series_object # is 2 an index of Series_object?

True

In [87]:
Series_object.keys() # selecting the indices of Series_object

RangeIndex(start=0, stop=25974, step=1)

In [88]:
# Changing the content of a Series object.
# The labels are passed to .loc as a list; a bare tuple key such as
# Series_object[5, 8] is treated as one multi-level label and is not a
# reliable way to pick two separate rows.

Series_object.loc[[5, 8]] = 9 # set the values at index labels 5 and 8 to 9
Series_object.head(10)

0    What is your age (# years)?
1                          50-54
2                          50-54
3                          22-24
4                          45-49
5                              9
6                          25-29
7                          18-21
8                              9
9                          22-24
Name: Q1, dtype: object

### Slicing Pandas Series


In [89]:
Series_object[0:4] # Selecting positions 0 through 3 (the stop value 4 is excluded)

0    What is your age (# years)?
1                          50-54
2                          50-54
3                          22-24
Name: Q1, dtype: object

In [90]:
Series_object[[0,5]] # Selecting index 0 and 5

0    What is your age (# years)?
5                              9
Name: Q1, dtype: object

In [91]:
Series_object[Series_object.eq('50-54')].head() # boolean mask: first five entries whose value is '50-54'

1      50-54
2      50-54
17     50-54
77     50-54
161    50-54
Name: Q1, dtype: object

In [107]:
# Things are a bit different when the index labels do not simply count up from 0 in steps of 1.
serie = pd.Series(data=list('abc'), index=[2, 4, 6])
serie

2    a
4    b
6    c
dtype: object

In [108]:
serie[4] # a single integer is looked up in the explicit index you defined: the label 4, not position 4

'b'

In [109]:
serie[6]

'c'

In [115]:
serie[1:3] # an integer slice uses the implicit, Python-style positions: elements at positions 1 and 2

4    b
6    c
dtype: object

In [123]:
# What if you want to select 'a' and 'c'?
# A list of integers is looked up by label, and 0 is not one of the labels
# (2, 4, 6), so this lookup raises a KeyError. The error is caught and
# displayed so the notebook still runs from top to bottom.
try:
    serie[[0, 2]]
except KeyError as err:
    print(f"KeyError: {err}")

In [121]:
serie[[2,6]] # you need to type the exact indices to get the values

2    a
6    c
dtype: object

### Loc and iloc

In [None]:
# Use loc whenever we want to access rows or columns by their labels.
# Use iloc whenever we want to access rows or columns by their integer position.

### Selecting with loc and iloc

In [125]:
serie.iloc[[0,2]] # method one using the index location

2    a
6    c
dtype: object

In [126]:
serie.loc[[2,6]] # method two using the labels

2    a
6    c
dtype: object

### Slicing with loc and iloc

In [130]:
serie.loc[2:6]

2    a
4    b
6    c
dtype: object

In [132]:
serie.iloc[0:3]

2    a
4    b
6    c
dtype: object

### Data Selection in DataFrame

In [165]:
dataframe = pd.DataFrame({'Quizzes': quizzes, 'Final_Exam':final_exam})
dataframe

Unnamed: 0,Quizzes,Final_Exam
Smith,65,89
Robin,82,69
Alain,48,97
Pierre,95,70
John,77,64


In [166]:
dataframe['Quizzes']

Smith     65
Robin     82
Alain     48
Pierre    95
John      77
Name: Quizzes, dtype: int64

In [167]:
dataframe.Quizzes

Smith     65
Robin     82
Alain     48
Pierre    95
John      77
Name: Quizzes, dtype: int64

In [169]:
# Add a new column holding the mean of the 'Quizzes' and 'Final_Exam' columns for each row.
dataframe['Average'] = dataframe['Quizzes'].add(dataframe['Final_Exam']).div(2)

In [170]:
dataframe

Unnamed: 0,Quizzes,Final_Exam,Average
Smith,65,89,77.0
Robin,82,69,75.5
Alain,48,97,72.5
Pierre,95,70,82.5
John,77,64,70.5


In [171]:
# getting the values of a column

dataframe['Quizzes'].values

array([65, 82, 48, 95, 77], dtype=int64)

In [172]:
dataframe['Average'].values

array([77. , 75.5, 72.5, 82.5, 70.5])

In [173]:
# Transpose the DataFrame: rows become columns and columns become rows
dataframe.T

Unnamed: 0,Smith,Robin,Alain,Pierre,John
Quizzes,65.0,82.0,48.0,95.0,77.0
Final_Exam,89.0,69.0,97.0,70.0,64.0
Average,77.0,75.5,72.5,82.5,70.5


In [174]:
dataframe.values[0] # getting the values of first row, which happens to be Smith

array([65., 89., 77.])

In [175]:
dataframe

Unnamed: 0,Quizzes,Final_Exam,Average
Smith,65,89,77.0
Robin,82,69,75.5
Alain,48,97,72.5
Pierre,95,70,82.5
John,77,64,70.5


### Selecting Data

In [176]:
dataframe.loc['Smith']

Quizzes       65.0
Final_Exam    89.0
Average       77.0
Name: Smith, dtype: float64

In [177]:
dataframe.iloc[0]

Quizzes       65.0
Final_Exam    89.0
Average       77.0
Name: Smith, dtype: float64

### Slicing Data

In [178]:
dataframe.loc[:,:] # all rows and columns

Unnamed: 0,Quizzes,Final_Exam,Average
Smith,65,89,77.0
Robin,82,69,75.5
Alain,48,97,72.5
Pierre,95,70,82.5
John,77,64,70.5


In [179]:
dataframe.loc[:'Alain',] # first three rows: a label slice includes its end label ('Alain')

Unnamed: 0,Quizzes,Final_Exam,Average
Smith,65,89,77.0
Robin,82,69,75.5
Alain,48,97,72.5


In [180]:
# Label-based slice: rows up to and including 'Robin', columns up to and
# including 'Final_Exam'. The column label must be spelled exactly as it
# appears in the frame ('Final_Exam', with an underscore); 'Final Exam' is
# not one of its columns.
dataframe.loc[:'Robin', :'Final_Exam']

Unnamed: 0,Quizzes,Final_Exam
Smith,65,89
Robin,82,69


In [181]:
dataframe.iloc[:2,:]

Unnamed: 0,Quizzes,Final_Exam,Average
Smith,65,89,77.0
Robin,82,69,75.5


In [182]:
dataframe.iloc[:, 1:3]

Unnamed: 0,Final_Exam,Average
Smith,89,77.0
Robin,69,75.5
Alain,97,72.5
Pierre,70,82.5
John,64,70.5


In [191]:
dataframe.loc[dataframe.Final_Exam > 75, ['Quizzes']]

Unnamed: 0,Quizzes
Smith,65
Alain,48


In [199]:
dataframe

Unnamed: 0,Quizzes,Final_Exam,Average
Smith,65,89,77.0
Robin,82,69,75.5
Alain,48,97,72.5
Pierre,95,70,82.5
John,77,64,70.5


In [194]:
dataframe.iloc[:3,:2]

Unnamed: 0,Quizzes,Final_Exam
Smith,65,89
Robin,82,69
Alain,48,97


In [200]:
# changing the quizzes mark for Smith

dataframe.loc['Smith', 'Quizzes'] = 74

In [202]:
dataframe.loc['Smith', 'Quizzes']

74