In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Reindexing

* An important method on pandas objects is reindex, which means to create a new
object with the values rearranged to align with the new index.

In [2]:
# Float Series whose labels are deliberately not in alphabetical order, so the
# reindex call further down visibly rearranges the values.
obj = pd.Series({"d": 4.5, "b": 7.2, "a": -5.3, "c": 3.6})

In [3]:
obj

Unnamed: 0,0
d,4.5
b,7.2
a,-5.3
c,3.6


In [5]:
obj2 = obj.reindex(["a","b","c","d"])

In [6]:
obj2

Unnamed: 0,0
a,-5.3
b,7.2
c,3.6
d,4.5


**For ordered data like time series, you may want to do some interpolation or filling of
values when reindexing. The method option allows us to do this, using a method such
as ffill, which forward-fills the values:**

In [7]:
# String Series labelled 0, 2, 4: the gaps at 1, 3 (and beyond 4) are what the
# forward-fill reindex calls below fill in.
obj3 = pd.Series({0: "blue", 2: "red", 4: "yellow"})

In [8]:
obj3

Unnamed: 0,0
0,blue
2,red
4,yellow


In [9]:
obj3.reindex(np.arange(6), method  = "ffill")

Unnamed: 0,0
0,blue
1,blue
2,red
3,red
4,yellow
5,yellow


In [10]:
obj3.reindex(np.arange(8), method  = "ffill")

Unnamed: 0,0
0,blue
1,blue
2,red
3,red
4,yellow
5,yellow
6,yellow
7,yellow


In [13]:
obj3.reindex(np.arange(5), method  = "ffill")

Unnamed: 0,0
0,blue
1,blue
2,red
3,red
4,yellow


**With DataFrame, reindex can alter the (row) index, columns, or both. When passed
only a sequence, it reindexes the rows in the result:**

In [14]:
# 3x3 integer frame. Row label "b" is intentionally absent, so the reindex
# below has to introduce it as a new, all-missing row.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=list("acd"),
    columns=["ohio", "texas", "california"],
)

In [15]:
frame

Unnamed: 0,ohio,texas,california
a,0,1,2
c,3,4,5
d,6,7,8


In [26]:
frame2 = frame.reindex(index  = ["a","b","c","d"], columns=["texas","ohio","california","florida"])

In [27]:
frame2

Unnamed: 0,texas,ohio,california,florida
a,1.0,0.0,2.0,
b,,,,
c,4.0,3.0,5.0,
d,7.0,6.0,8.0,


**Another way to reindex a particular axis is to pass the new axis labels as a positional
argument and then specify the axis to reindex with the axis keyword:**

In [57]:
states = ["texas","ohio","california"]
frame2.reindex(states, axis="columns")

Unnamed: 0,texas,ohio,california
a,1.0,0.0,2.0
b,,,
c,4.0,3.0,5.0
d,7.0,6.0,8.0


# Dropping Entries from an Axis.

* Dropping one or more entries from an axis is simple if you already have an index
array or list without those entries, since you can use the reindex method or `.loc`-based indexing. As that can require a bit of munging and set logic, the drop method
will return a new object with the indicated value or values deleted from an axis:

In [28]:
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])

In [30]:
obj

Unnamed: 0,0
a,0.0
b,1.0
c,2.0
d,3.0
e,4.0


In [31]:
new_obj = obj.drop("d")

In [32]:
new_obj

Unnamed: 0,0
a,0.0
b,1.0
c,2.0
e,4.0


In [33]:
obj

Unnamed: 0,0
a,0.0
b,1.0
c,2.0
d,3.0
e,4.0


In [34]:
obj.drop(["d", "c"])

Unnamed: 0,0
a,0.0
b,1.0
e,4.0


In [35]:
obj

Unnamed: 0,0
a,0.0
b,1.0
c,2.0
d,3.0
e,4.0


**With DataFrame, index values can be deleted from either axis. To illustrate this, we
first create an example DataFrame:**

In [38]:
# 4x4 integer frame (values 0..15) with state names as row labels; used to
# demonstrate dropping rows and columns.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)

In [39]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [40]:
data.drop(index = ["Ohio"], columns = ["two"])

Unnamed: 0,one,three,four
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


**You can also drop values from the columns by passing axis=1 (which is like NumPy)
or axis="columns":**

In [41]:
data.drop("two", axis = 1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


# Indexing, Selection, and Filtering.

In [42]:
obj = pd.Series(np.arange(4.), index=["a", "b", "c", "d"])

In [43]:
obj

Unnamed: 0,0
a,0.0
b,1.0
c,2.0
d,3.0


In [44]:
obj["b"]

np.float64(1.0)

In [47]:
obj[2:4]

Unnamed: 0,0
c,2.0
d,3.0


In [48]:
obj[["b","d","d"]]

Unnamed: 0,0
b,1.0
d,3.0
d,3.0


In [50]:
# Select by integer position explicitly with iloc. Plain obj[[1, 3, 2]] on a
# Series with string labels relies on the deprecated fallback that treats
# integer keys as positions and emits a FutureWarning.
obj.iloc[[1, 3, 2]]

  obj[[1,3,2]]


Unnamed: 0,0
b,1.0
d,3.0
c,2.0


In [51]:
obj[obj<2]

Unnamed: 0,0
a,0.0
b,1.0


**Indexing into a DataFrame retrieves one or more columns either with a single value
or sequence:**

In [52]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),index=["Ohio", "Colorado", "Utah", "New York"], columns=["one", "two", "three", "four"])

In [53]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [58]:
data["two"]

Unnamed: 0,two
Ohio,1
Colorado,5
Utah,9
New York,13


In [60]:
data[["three","one"]]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [62]:
data[:1]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3


In [63]:
data[data["three"]>4]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [64]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [65]:
# Boolean-mask assignment: every element of `data` that is less than 5 is
# overwritten with 0. This modifies `data` in place, so the cells below see
# the updated frame rather than the one displayed above.
data[data < 5] = 0

# Selection on DataFrame with loc and iloc.

* Like Series, DataFrame has special attributes loc and iloc for label-based and
integer-based indexing, respectively. Since DataFrame is two-dimensional, you can
select a subset of the rows and columns with NumPy-like notation using either axis
labels (loc) or integers (iloc).


In [66]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [68]:
data.loc[["Colorado","Utah"]]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
Utah,8,9,10,11


In [69]:
data.loc["Utah", ["two","three"]]

Unnamed: 0,Utah
two,9
three,10


In [70]:
data.iloc[2]

Unnamed: 0,Utah
one,8
two,9
three,10
four,11


In [71]:
data.iloc[[2,1]]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
Colorado,0,5,6,7


In [72]:
data.iloc[2,[1,2]]

Unnamed: 0,Utah
two,9
three,10


In [73]:
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


**Both indexing functions work with slices in addition to single labels or lists of labels:**

In [76]:
data.loc[:"Utah",:"two"]

Unnamed: 0,one,two
Ohio,0,0
Colorado,0,5
Utah,8,9


In [77]:
# Rows where column "three" exceeds 5, restricted to the first three columns,
# expressed as a single label-based selection instead of two chained lookups.
data.loc[data["three"] > 5, data.columns[:3]]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


# Arithmetic and Data Alignment

In [78]:
# Two float Series with only partially overlapping labels ("a", "c", "e" are
# shared), so that adding them demonstrates index alignment.
s1 = pd.Series({"a": 7.3, "c": -2.5, "d": 3.4, "e": 1.5})
s2 = pd.Series({"a": -2.1, "c": 3.6, "e": -1.5, "f": 4, "g": 3.1})

In [79]:
s1

Unnamed: 0,0
a,7.3
c,-2.5
d,3.4
e,1.5


In [80]:
s2

Unnamed: 0,0
a,-2.1
c,3.6
e,-1.5
f,4.0
g,3.1


In [81]:
s1 + s2

Unnamed: 0,0
a,5.2
c,1.1
d,
e,0.0
f,
g,


In [82]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list("bcd"),index=["Ohio", "Texas", "Colorado"])


In [83]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [84]:
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list("bde"),index=["Utah", "Ohio", "Texas", "Oregon"])

In [85]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [86]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


**If you add DataFrame objects with no column or row labels in common, the result
will contain all nulls:**

In [92]:
df1 = pd.DataFrame({'A': [1,2,3]})

In [93]:
df1

Unnamed: 0,A
0,1
1,2
2,3


In [94]:
df2 = pd.DataFrame({'B': [1,2,3]})

In [95]:
df1 + df2

Unnamed: 0,A,B
0,,
1,,
2,,


In [96]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list("abcd"))


In [97]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [98]:
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),columns=list("abcde"))


In [99]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [103]:
df2.loc[1,"b"] = np.nan

In [104]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [105]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [106]:
df1.add(df2,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


**Operations between DataFrame and Series**

In [107]:
arr = np.arange(12.).reshape((3,4))

In [108]:
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [109]:
arr[0]

array([0., 1., 2., 3.])

In [110]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [111]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])

In [112]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [115]:
series = frame.iloc[0]

In [116]:
series

Unnamed: 0,Utah
b,0.0
d,1.0
e,2.0


In [117]:
frame -series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [118]:
 series2 = pd.Series(np.arange(3), index=["b", "e", "f"])

In [120]:
series2

Unnamed: 0,0
b,0
e,1
f,2


In [121]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [126]:
series3 = frame.loc[:, "d"]

In [127]:
series3

Unnamed: 0,d
Utah,1.0
Ohio,4.0
Texas,7.0
Oregon,10.0


In [128]:
frame.sub(series3, axis = "index")

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


# Sorting and Ranking

In [130]:
obj = pd.Series(np.arange(4), index=["d", "a", "b", "c"])

In [131]:
obj

Unnamed: 0,0
d,0
a,1
b,2
c,3


In [132]:
obj.sort_index()

Unnamed: 0,0
a,1
b,2
c,3
d,0


In [133]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),index=["three", "one"],columns=["d", "a", "b", "c"])

In [134]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [135]:
frame.sort_index(axis = 0)

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [137]:
frame.sort_index(axis =1 )

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [138]:
frame.sort_index(axis="columns", ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


**To sort a Series by its values, use its sort_values method:**

In [139]:
obj.sort_values()

Unnamed: 0,0
d,0
a,1
b,2
c,3


**Any missing values are sorted to the end of the Series by default:**

In [140]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])

In [141]:
obj.sort_values()

Unnamed: 0,0
4,-3.0
5,2.0
0,4.0
2,7.0
1,
3,


**Missing values can be sorted to the start instead by using the na_position option:**

In [142]:
obj.sort_values(na_position="first")

Unnamed: 0,0
1,
3,
4,-3.0
5,2.0
0,4.0
2,7.0


In [143]:
frame = pd.DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})

In [144]:
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [145]:
frame.sort_values("b")

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [146]:
frame.sort_values(["a","b"])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


# Summarizing and Computing Descriptive Statistics.

In [147]:
# Small float frame with missing values: row "c" is entirely NaN and row "a"
# is missing only column "two"; used to show how reductions treat NaN.
df = pd.DataFrame(
    {
        "one": [1.4, 7.1, np.nan, 0.75],
        "two": [np.nan, -4.5, np.nan, -1.3],
    },
    index=["a", "b", "c", "d"],
)


In [148]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [149]:
df.sum()

Unnamed: 0,0
one,9.25
two,-5.8


In [151]:
df.sum(axis = "columns")

Unnamed: 0,0
a,1.4
b,2.6
c,0.0
d,-0.55


**By default, NA values are skipped: when an entire row or column contains all NA values, the sum is 0,
whereas if any value is not NA, the result is the sum of the non-NA values. This can be disabled with the skipna option, in
which case any NA value in a row or column makes the corresponding result NA:**

In [152]:
df.sum(axis  = "index", skipna=False)

Unnamed: 0,0
one,
two,


In [153]:
df.sum(axis = "columns", skipna=False)

Unnamed: 0,0
a,
b,2.6
c,
d,-0.55


**Some aggregations, like mean, require at least one non-NA value to yield a valid
result, so here we have:**

In [154]:
df.mean(axis="columns")

Unnamed: 0,0
a,1.4
b,1.3
c,
d,-0.275


In [155]:
df.mean(axis="columns",skipna=False)

Unnamed: 0,0
a,
b,1.3
c,
d,-0.275


In [156]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [162]:
obj = pd.Series(["a","b","c"] * 3)

In [163]:
obj

Unnamed: 0,0
0,a
1,b
2,c
3,a
4,b
5,c
6,a
7,b
8,c


In [164]:
obj.describe()

Unnamed: 0,0
count,9
unique,3
top,a
freq,3


# Correlation and Covariance


In [175]:
# Google Colab-specific cell: mounts Google Drive at /content/drive so that the
# pickle files read in the following cells are reachable under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [179]:
import pickle
# NOTE: unpickling can execute arbitrary code; only load pickle files that come
# from a trusted source.
file_path = '/content/drive/My Drive/Colab Notebooks/yahoo_price.pkl'
# pd.read_pickle takes the path directly and manages the file handle itself,
# so no explicit open()/with block is needed.
price = pd.read_pickle(file_path)

In [181]:
# NOTE: unpickling can execute arbitrary code; only load pickle files that come
# from a trusted source.
file_path = '/content/drive/My Drive/Colab Notebooks/yahoo_volume.pkl'
# pd.read_pickle takes the path directly and manages the file handle itself,
# so no explicit open()/with block is needed.
volume = pd.read_pickle(file_path)

In [184]:
price.head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,27.990226,313.062468,113.304536,25.884104
2010-01-05,28.038618,311.683844,111.935822,25.892466
2010-01-06,27.592626,303.826685,111.208683,25.733566
2010-01-07,27.541619,296.753749,110.823732,25.465944
2010-01-08,27.724725,300.709808,111.935822,25.641571


In [183]:
volume.head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,123432400,3927000,6155300,38409100
2010-01-05,150476200,6031900,6841400,49749600
2010-01-06,138040000,7987100,5605300,58182400
2010-01-07,119282800,12876600,5840600,50559700
2010-01-08,111902700,9483900,4197200,51197400


In [185]:
returns = price.pct_change()

# Unique Values, Value Counts, and Membership.

In [186]:
obj = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])

In [188]:
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [189]:
obj.value_counts()

Unnamed: 0,count
c,3
a,3
b,2
d,1


In [190]:
obj.isna()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
5,False
6,False
7,False
8,False


In [192]:
mask = obj.isin(["b","c"])

In [193]:
obj[mask]

Unnamed: 0,0
0,c
5,b
6,b
7,c
8,c


In [194]:
# Three integer columns of five responses each; used for the value-count
# examples that follow.
data = pd.DataFrame(
    {
        "Qu1": [1, 3, 4, 3, 4],
        "Qu2": [2, 3, 1, 2, 3],
        "Qu3": [1, 5, 2, 4, 4],
    }
)

In [199]:
data["Qu1"].value_counts().sort_index()

Unnamed: 0_level_0,count
Qu1,Unnamed: 1_level_1
1,1
3,2
4,2


In [202]:
# There is also a DataFrame.value_counts method, but it computes counts considering
# each row of the DataFrame as a tuple to determine the number of occurrences of each
# distinct row:
data.value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
Qu1,Qu2,Qu3,Unnamed: 3_level_1
1,2,1,1
3,2,4,1
3,3,5,1
4,1,2,1
4,3,4,1


In [200]:
# Count the values in each column and align the counts by value; a value that
# never occurs in a column comes out as NaN and is filled with 0.
# pd.Series.value_counts replaces the deprecated top-level pd.value_counts,
# which emits a FutureWarning.
data.apply(pd.Series.value_counts).fillna(0)

  data.apply(pd.value_counts).fillna(0)


Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
