In [50]:
!pip install pandas --upgrade

[0m

In [90]:
# Import pandas under its conventional alias and display the installed
# version, so readers know which release produced the outputs below.
import pandas as pd
pd.__version__

'2.1.3'

In [52]:
# A pandas DataFrame is a tabular representation of data: rows and columns.
# A pandas Series is a one-dimensional, labeled array of values.

In [53]:
# Build a Series from a plain Python list; pandas supplies a default
# integer index (0, 1, 2, ...) because none is given.
list_of_nums = [3, 4, 7, 2, 9]
series_of_nums = pd.Series(data=list_of_nums)
series_of_nums

0    3
1    4
2    7
3    2
4    9
dtype: int64

In [54]:
# In the printed Series the first column is the index (the labels) and the
# second column holds the values. Use .loc for label-based lookup: a bare
# integer key in [] is ambiguous between "label" and "position", and pandas
# 2.1 deprecates the positional fallback.
series_of_nums.loc[1]

4

In [55]:
# Select several elements at once by passing a list of index labels.
# (This is a list of labels, not a true/false mask; boolean masks come next.)
index_mask = [0, 3, 4]
series_of_nums.loc[index_mask]

0    3
3    2
4    9
dtype: int64

In [56]:
# A boolean mask has one entry per element; True keeps the element at that
# position, False leaves it out.
boolean_mask = [True, False, False, True, False]
series_of_nums.loc[boolean_mask]

0    3
3    2
dtype: int64

In [57]:
# (In the boolean mask above, True means "return this value".)
# Inspect the index object that labels the elements of the series.
series_of_nums.index

RangeIndex(start=0, stop=5, step=1)

In [58]:
# pandas uses integers as the default index labels. Any other labels can be
# used instead; here the five elements are relabelled 'a' through 'e'.
series_of_nums = series_of_nums.set_axis(['a', 'b', 'c', 'd', 'e'])
series_of_nums

a    3
b    4
c    7
d    2
e    9
dtype: int64

In [59]:
# Look up one value by its index label.
series_of_nums.loc['b']

4

In [60]:
# To get several values back, pass a list of index labels; the result is
# itself a Series.
series_of_nums.loc[['b', 'c']]

b    4
c    7
dtype: int64

In [61]:
# Build a Series from a dictionary: each key becomes an index label and each
# value becomes the element stored under that label.
dict_of_data = dict(a=3, b=4, c=7, d=2, e=9)
series_from_dict = pd.Series(data=dict_of_data)
series_from_dict

a    3
b    4
c    7
d    2
e    9
dtype: int64

In [62]:
# The dictionary keys became the index and the values became the elements.
# A Series can also carry a name, which is shown alongside the dtype.
series_from_dict = series_from_dict.rename("Series from dictionary")
series_from_dict

a    3
b    4
c    7
d    2
e    9
Name: Series from dictionary, dtype: int64

In [63]:
###### SERIES OPERATIONS ########

# Two series to compare and combine; the values 1, 3 and 5 appear in both.
series_one = pd.Series(data=[1, 2, 3, 4, 5])
series_two = pd.Series(data=[1, 3, 5, 7, 9])

In [64]:
# It is not always possible to eyeball the data, so use methods to evaluate it.
# Series.isin(values) returns a boolean Series: True wherever the element of
# series_one also occurs in the values being compared against.
in_series_two = series_one.isin(series_two)
in_series_two

0     True
1    False
2     True
3    False
4     True
dtype: bool

In [66]:
# The isin() result is a boolean mask like the one used earlier, so it can
# index into series_one: keep only the values that also exist in series_two.
series_one.loc[series_one.isin(series_two)]

0    1
2    3
4    5
dtype: int64

In [68]:
# The reverse: values of series_one that are NOT in series_two.
# The tilde (~) inverts the mask, so False becomes True and True becomes False.
series_one.loc[~series_one.isin(series_two)]

1    2
3    4
dtype: int64

In [69]:
# Transformations: .map() applies the given function (here a lambda) to
# every element of the series and returns the results as a new Series.
series_squares = series_one.map(lambda value: value ** 2)
series_squares

0     1
1     4
2     9
3    16
4    25
dtype: int64

In [71]:
# Element-wise addition of two Series objects with .add(); elements are
# paired up by index label.
series_one.add(other=series_two)

0     2
1     5
2     8
3    11
4    14
dtype: int64

In [73]:
# Changing data types with .astype().
# Start by displaying series_one: its dtype is int64.
series_one

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [74]:
# Convert the integers to floating point; .astype() returns a new Series
# and leaves series_one itself unchanged.
series_one.astype(dtype="float64")

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: float64

In [84]:
######### APPENDING AND SORTING SERIES #################

In [82]:
# Append series_two to series_one. A Series has no .concat() method (and
# Series.append was removed in pandas 2.0), so use the top-level pd.concat(),
# which takes a list of the objects to join end to end.
appended_series = pd.concat([series_one, series_two])
appended_series

In [86]:
# Minimal concat example: each input keeps its own index labels, so the
# result contains the labels 0 and 1 twice.
s1 = pd.Series(data=['a', 'b'])
s2 = pd.Series(data=['c', 'd'])
s3 = pd.concat(objs=[s1, s2])
s3

0    a
1    b
0    c
1    d
dtype: object

In [92]:
# pd.concat works, but the original index labels are kept, so the result
# has duplicate labels (0-4 appear twice).
pd.concat(objs=[series_one, series_two])

0    1
1    2
2    3
3    4
4    5
0    1
1    3
2    5
3    7
4    9
dtype: int64

In [94]:
# ignore_index=True discards the original index labels and gives the
# combined series a fresh 0..n-1 index.
pd.concat(objs=[series_one, series_two], ignore_index=True)

0    1
1    2
2    3
3    4
4    5
5    1
6    3
7    5
8    7
9    9
dtype: int64

In [101]:
# Build the combined series with a fresh 0..9 index so that individual labels
# can be dropped from it next, e.g. .drop(8) for a single label or
# .drop(labels=[5, 7]) for several.
appended_series = pd.concat(objs=[series_one, series_two], ignore_index=True)
appended_series

0    1
1    2
2    3
3    4
4    5
5    1
6    3
7    5
8    7
9    9
dtype: int64

In [102]:
# Remove the elements stored under specific index labels; .drop() returns a
# new Series rather than modifying appended_series.
appended_series.drop(index=[5, 7])

0    1
1    2
2    3
3    4
4    5
6    3
8    7
9    9
dtype: int64

In [104]:
# How many (non-missing) values does the series hold?
appended_series.count()

10

In [106]:
# How many distinct values are there?
appended_series.nunique()

7

In [108]:
# Return the distinct values themselves (as a NumPy array, in order of
# first appearance).
appended_series.unique()

array([1, 2, 3, 4, 5, 7, 9])

In [111]:
# Sorting: sort_values() returns a new Series ordered by value; each element
# keeps its original index label, and appended_series itself is not changed.
appended_series.sort_values()

0    1
5    1
1    2
2    3
6    3
3    4
4    5
7    5
8    7
9    9
dtype: int64

In [113]:
# Display the series again to confirm the sort above did not modify it.
appended_series

0    1
1    2
2    3
3    4
4    5
5    1
6    3
7    5
8    7
9    9
dtype: int64

In [116]:
# How to sort the series without creating a new Series object:
# inplace=True reorders appended_series itself (and returns None), so every
# later cell sees the sorted data.
appended_series.sort_values(inplace=True)
appended_series

0    1
5    1
1    2
2    3
6    3
3    4
4    5
7    5
8    7
9    9
dtype: int64

In [119]:
# After sorting by value the index labels are no longer in order: each item
# still carries the label it had before the sort.
# reset_index() returns a DataFrame with two columns: one named "index"
# holding the original labels and one named 0 holding the values. The new
# index runs from 0 to 9, so the original labels are preserved as data.
# Sorting is done with a non-mutating chain here; a second in-place sort
# would be a redundant side effect just to display this result.
appended_series.sort_values().reset_index()

Unnamed: 0,index,0
0,0,1
1,5,1
2,1,2
3,2,3
4,6,3
5,3,4
6,4,5
7,7,5
8,8,7
9,9,9


In [121]:
# To discard the old index labels and just renumber from 0, pass drop=True;
# the result is then a Series rather than a DataFrame.
appended_series.reset_index(drop=True)

0    1
1    1
2    2
3    3
4    3
5    4
6    5
7    5
8    7
9    9
dtype: int64