In [None]:
import numpy as np
import pandas as pd

## 1. Intro to NumPy

### 1.1 Basics of NumPy array

<b>Data manipulation in Python is nearly synonymous with NumPy array manipulation</b>

<b>NumPy Array Attributes:</b>

In [None]:
import numpy as np
np.random.seed(0)  # seed for reproducibility

x1 = np.random.randint(10, size=6)  # One-dimensional array
x2 = np.random.randint(10, size=(3, 4))  # Two-dimensional array
x3 = np.random.randint(10, size=(3, 4, 5))  # Three-dimensional array

In [None]:
print("x3 ndim: ", x3.ndim)
print("x3 shape:", x3.shape)
print("x3 size: ", x3.size)

In [None]:
print("dtype:", x3.dtype)

In [None]:
print("itemsize:", x3.itemsize, "bytes")
print("nbytes:", x3.nbytes, "bytes")

In [None]:
print ("Full Array:",x1)
print ("0th element:",x1[0])
print (x1[4])
print (x1[-1])
print (x1[-2])

In [None]:
print (x2)
print (x2[0, 0])
print (x2[2, 0])
print (x2[2, -1])

In [None]:
x2[0, 0] = 12
x2

In [None]:
x1[0] = 3.14159  # this will be truncated!
x1

### Array Slicing: Accessing Subarrays

<b>Just as we can use square brackets to access individual array elements, we can also use them to access subarrays with the slice notation, marked by the colon (:) character.</b>

<b>x[start:stop:step]</b>

In [None]:
x = np.arange(10)
x

In [None]:
print (x[:5])  # first five elements
print (x[5:])  # elements from index 5 onward (index 5 is included)
print (x[4:7]) # middle sub-array
print (x[::2]) # every other element
print (x[1::2])# every other element, starting at index 1

<b>Multi-dimensional</b>

In [None]:
x2

In [None]:
print (x2[:2, :3]) # two rows, three columns
print (x2[:3, ::2])  #all rows, every other column
print (x2[::-1, ::-1])  # rows and columns both in reverse order

<b>Accessing array rows and columns</b>

In [None]:
print(x2[:, 0])  # first column of x2
print(x2[0, :])  # first row of x2
print(x2[0])  # equivalent to x2[0, :]

<b>Subarrays as no-copy views</b>

In [None]:
print(x2)
x2_sub = x2[:2, :2]
print(x2_sub)

In [None]:
x2_sub[0, 0] = 99
print(x2_sub)
print(x2)

<b>Creating copies of arrays</b>

In [None]:
x2_sub_copy = x2[:2, :2].copy()
print(x2_sub_copy)

In [None]:
x2_sub_copy[0, 0] = 42
print(x2_sub_copy)

In [None]:
print(x2)

<b>Reshaping array:</b>

In [None]:
# Arrange the integers 1..9 as a 3x3 grid.
grid = np.reshape(np.arange(1, 10), (3, 3))
print(grid)

<b>Concatenation of arrays:</b>

In [None]:
x = np.array([1, 2, 3])
y = np.array([3, 2, 1])
np.concatenate([x, y])

In [None]:
z = [99, 99, 99]
print(np.concatenate([x, y, z]))

In [None]:
grid = np.array([[1, 2, 3],
                 [4, 5, 6]])

In [None]:
# concatenate along the first axis
np.concatenate([grid, grid])

In [None]:
# concatenate along the second axis (zero-indexed)
np.concatenate([grid, grid], axis=1)

<b>Splitting of array:</b>

In [None]:
# np.split cuts at indices 3 and 5, giving the pieces x[:3], x[3:5] and x[5:].
# The pieces get their own names so the x1/x2/x3 demo arrays created at the top
# of the notebook are not overwritten (earlier cells stay valid when re-run).
x = [1, 2, 3, 99, 99, 3, 2, 1]
head, middle, tail = np.split(x, [3, 5])
print(head, middle, tail)

### Some Useful NumPy Function:

In [None]:
x = np.arange(4)
print (x)

In [None]:
np.add(x, 2)

<b>Operator	Equivalent ufunc	Description</b><br>
\+	`np.add`	Addition (e.g., 1 + 1 = 2)<br>
\-	`np.subtract`	Subtraction (e.g., 3 - 2 = 1)<br>
\-	`np.negative`	Unary negation (e.g., -2)<br>
\*	`np.multiply`	Multiplication (e.g., 2 * 3 = 6)<br>
/	`np.divide`	Division (e.g., 3 / 2 = 1.5)<br>
//	`np.floor_divide`	Floor division (e.g., 3 // 2 = 1)<br>
\**	`np.power`	Exponentiation (e.g., 2 ** 3 = 8)<br>
%	`np.mod`	Modulus/remainder (e.g., 9 % 4 = 1)<br>

In [None]:
x = np.array([-2, -1, 0, 1, 2])
np.abs(x)

<b>Trigonometric functions:</b>

In [None]:
x = [-1, 0, 1]
print("x         = ", x)
print("arcsin(x) = ", np.arcsin(x))
print("arccos(x) = ", np.arccos(x))
print("arctan(x) = ", np.arctan(x))

<b>Exponents and logarithms</b>

In [None]:
x = [1, 2, 3]
print("x     =", x)
print("e^x   =", np.exp(x))
print("2^x   =", np.exp2(x))
print("3^x   =", np.power(3, x))

In [None]:
x = [1, 2, 4, 10]
print("x        =", x)
print("ln(x)    =", np.log(x))
print("log2(x)  =", np.log2(x))
print("log10(x) =", np.log10(x))

### Aggregations: Min, Max, and Everything In Between

In [None]:
L = np.random.random(100)
sum(L)

In [None]:
np.sum(L)

In [None]:
big_array = np.random.rand(1000000)
%timeit sum(big_array)
%timeit np.sum(big_array)

In [None]:
M = np.random.random((3, 4))
print(M)

In [None]:
M.min(axis=0) #within each column

In [None]:
M.max(axis=1) #within each row

<b>other aggregations:</b>

<b>Name	  NaN-safe Version	Description</b><br>
`np.sum`	`np.nansum`	Compute sum of elements<br>
`np.prod`	`np.nanprod`	Compute product of elements<br>
`np.mean`	`np.nanmean`	Compute mean of elements<br>
`np.std`	`np.nanstd`	Compute standard deviation<br>
`np.var`	`np.nanvar`	Compute variance<br>
`np.min`	`np.nanmin`	Find minimum value<br>
`np.max`	`np.nanmax`	Find maximum value<br>
`np.argmin`	`np.nanargmin`	Find index of minimum value<br>
`np.argmax`	`np.nanargmax`	Find index of maximum value<br>
`np.median`	`np.nanmedian`	Compute median of elements<br>
`np.percentile`	`np.nanpercentile`	Compute rank-based statistics of elements<br>
`np.any`	`N/A`	Evaluate whether any elements are true<br>
`np.all`	`N/A`	Evaluate whether all elements are true<br>

### Comparisons, Masks, and Boolean Logic:

In [None]:
x = np.array([1, 2, 3, 4, 5])
print (x)
print (x < 3) # less than
print (x > 3) # greater than
print (x <= 3)# less than or equal

In [None]:
np.less(x, 3)

`==`	`np.equal`		<br>
`!=`	`np.not_equal`<br>
`<`	`np.less`		<br>
`<=`	`np.less_equal`<br>
`>`	`np.greater`		<br>
`>=`	`np.greater_equal`<br>

In [None]:
rng = np.random.RandomState(0)
x = rng.randint(10, size=(3, 4))
x

In [None]:
x < 6

<b>Working with Boolean Arrays:</b>

In [None]:
print(x)
# how many values less than 6?
#To count the number of True entries in a Boolean array, np.count_nonzero is useful:
np.count_nonzero(x < 6)

In [None]:
# Another way to get at this information is to use np.sum; in this case, False is interpreted as 0, and True is interpreted as 1:
np.sum(x < 6)

In [None]:
# how many values less than 6 in each row?
#The benefit of sum() is that like with other NumPy aggregation functions, this summation can be done along rows or columns as well:
np.sum(x < 6, axis=1)

In [None]:
# If we're interested in quickly checking whether any or all the values are true, we can use (you guessed it) np.any or np.all.
# A cell only displays its last expression, so both checks are returned together as a tuple.
np.any(x > 8), np.any(x < 0)

### Sorting Arrays

<b>Fast Sorting in NumPy: np.sort and np.argsort:</b>

In [None]:
#To return a sorted version of the array without modifying the input, you can use np.sort:
x = np.array([2, 1, 4, 3, 5])
np.sort(x)

In [None]:
#If you prefer to sort the array in-place, you can instead use the sort method of arrays:
x.sort()
print(x)

In [None]:
# argsort does not sort the data itself; it returns the positions that would sort it.
x = np.array([2, 1, 4, 3, 5])
i = x.argsort()
print(i)

<b>Sorting along rows or columns:</b>

In [None]:
rand = np.random.RandomState(42)
X = rand.randint(0, 10, (4, 6))
print(X)

In [None]:
# sort each column of X
np.sort(X, axis=0)

In [None]:
# sort each row of X
np.sort(X, axis=1)

<b>Partial Sorts: Partitioning:</b>

Sometimes we're not interested in sorting the entire array, but simply want to find the k smallest values in the array. NumPy provides this in the np.partition function. np.partition takes an array and a number K; the result is a new array with the smallest K values to the left of the partition, and the remaining values to the right, in arbitrary order:

In [None]:
x = np.array([7, 2, 3, 1, 6, 5, 4])
np.partition(x, 3)

In [None]:
import numpy as np
import pandas as pd

## 2. Intro to Pandas

### 2.1 Pandas objects

<b>We'll be looking at two main objects of the Pandas library namely, Series and DataFrames.</b>

#### 2.1.1 Series
A Series object is a one-dimensional array/list of values that are indexed. Think of it like an indexed 'series' of values. 
<br>Let's look at some examples:

In [None]:
# List -> Series
numbers = pd.Series([1, 2, 3.4, 5.67, 8, 0.9])

names = pd.Series(['Alane', 'Ayanna', 'Tyisha', 'Jarvis', 'Tabetha', 'Geoffrey', 'Ken'])

print(numbers, '\n')
print(names)

As you can see, a default index is added to the list of values. Let's add a <b>custom index</b>.

In [None]:
custom_index = 'abcdef'

# Please note how we use the attribute 'values' for a series object
numbers2 = pd.Series(numbers.values, index=list(custom_index))

numbers2

Let's look at the values in the series objects we've created, using indexes.

In [None]:
print('The second value in series numbers is: ', numbers[1], '\n')

print('The second value in series numbers2 is: ', numbers2['b'])

Another way to address it using indices can be the following:

In [None]:
print('The first three values in series numbers2 are:')

print(numbers2[:'c']) # numbers2['a':'c'], numbers2[:3] and numbers2[:-3] select the same rows; numbers2[1:3] does not (it skips 'a')

Let's look at another way that we can create a series object: <b>Dictionaries</b>

In [None]:
# Build the {position: state} mapping by numbering the state names from 0;
# pd.Series then uses the dict keys as the index and the dict values as the data.
locations_dict = dict(enumerate(
    ['California', 'New York', 'Virginia', 'Michigan', 'Texas', 'Nevada', 'Illinois']
))

locations = pd.Series(locations_dict)

locations

#### 2.1.2 Data Frames
A pandas DataFrame is a two-dimensional, size-mutable, potentially heterogeneous tabular data structure with labeled axes. It can be thought of as a special form of a Python dictionary or a NumPy array.
<br>Let's look at some examples:

In [None]:
# one of the most common ways to create a data frame
age = {0:5, 1:21, 2:12, 3:10, 4:30, 5:13, 6:70}

data1 = pd.DataFrame({'Name': names, 'Age': age, 'Location': locations})

data1

Let's play with the index a little (both the column and row index) while introducing a new way to create a data frame with pandas (using an existing data frame).

In [None]:
data2 = pd.DataFrame(data=data1).set_index('Name', drop=True)

data2

Let's create a data frame using <b>numpy arrays</b>.

In [None]:
data3 = pd.DataFrame(np.arange(12).reshape(6, 2), columns=['Even', 'Odd'], index=list(custom_index))

data3

### 2.2 Selection and Indexing of Data in Pandas

Let's look at a couple of ways we access the columns of a data frame in pandas.
<br><b>Note</b>: For this section, we'll use the '<b>data2</b>' data frame created earlier.

In [None]:
data2['Age']

In [None]:
data2.Age

In [None]:
# Compare the two access styles by value. `is` only tests object identity, so its
# answer depends on whether pandas happens to hand back the same cached object.
data2.Age.equals(data2['Age'])

Both the above usages give the same result.

Now, we'll use this to operate on our data. We'll see how we can create a new column and enter values in that column by operating on an existing column.
<br><br>Let's say that the 'Age' information in the dataset is 10 years old and we need to add a new column that has the adjusted values. Following is how we can accomplish that:

In [None]:
data2['Age_current'] = data2['Age'] + 10

data2

We can also combine multiple Series of the same size to perform similar operations.

Now we'll look at some attributes that can be used by a pandas DataFrame object.

In [None]:
# columns
data2.columns

In [None]:
# index
data2.index

In [None]:
# values
data2.values

In [None]:
# indexing the values
data2.values[1]

Now, let's look at a bit more sophisticated methods for indexing.
<br>We'll use the following:
1. <b>iloc</b>: simple array like implicit integer indexer
2. <b>loc</b>: uses explicit index and column names

In [None]:
# first two columns and all rows except the first
data2.iloc[1:, :2]

In [None]:
# first two columns and all rows except the first
# (label-based counterpart of data2.iloc[1:, :2]; 'Ayanna' is the second row label,
# and .loc slices include both end labels, so :'Location' keeps 'Age' and 'Location')
data2.loc['Ayanna':, :'Location']

Let's use what we've learned in this section to apply a mask to our data and output only selected columns like we would do using an SQL query.

In [None]:
data2.loc[data2.Age_current > 25, ['Age_current', 'Location']]

### 2.3 Handling missing values in Pandas

Pandas uses two sentinels for representing null values: <b>None</b> and <b>NaN</b>.
1. <b>None</b> is a Python object that is often used to mark missing values; a NumPy array that contains it has the generic `object` dtype.
2. <b>NaN</b> is a special floating-point value. Any arithmetic that involves NaN yields NaN, so operating on data that contains NaN values tends to give unwanted results.
<br>We'll look at both in the following cells:

In [None]:
missing1 = np.array([1, 2, 3, None])
missing1

In [None]:
missing2 = np.array([1, 2, 3, np.nan])
missing2
# missing2.dtype

In [None]:
# operating a value with NaN will result in NaN
print('Addition: (0 + {}) = {}'.format(np.nan, (0+np.nan)))
print('Multiplication: (1 * {}) = {}'.format(np.nan, (1*np.nan)))

<br>In Pandas, 'None' values are converted to 'NaN' values through type casting as and when required. Let's look at an example:

In [None]:
# example = pd.Series([1, 2, 3])
example = pd.Series([1, 2, np.nan, 3, None])
example

<br>Now we'll see how we can operate on null values in pandas. Following are some functions we'll be looking at:
1. isnull( )
2. notnull( )
3. fillna( )
4. dropna( )

In [None]:
# isnull returns a boolean mask for the data
example.isnull()

In [None]:
# notnull also returns a boolean mask for the data but it's opposite to isnull
example.notnull()

In [None]:
# use either of those masks to access the data
example[example.notnull()]

Now we'll use the pandas data frame to see how the functions <b>fillna( )</b> and <b>dropna( )</b> work.

In [None]:
missing3 = data1.copy()
missing3.loc[3:5, 'Age'] = np.nan
missing3

In [None]:
# dropping rows with null values
missing3.dropna(inplace=False)

In [None]:
# let's fill these values with the average of the non-null values.
# Series.mean() skips NaN by default, so no dropna() is needed. Passing a
# {column: value} mapping limits the fill to 'Age', so a missing value in any
# other column would never be replaced by an age. Reassigning instead of using
# inplace=True keeps the step explicit.
missing3 = missing3.fillna({'Age': missing3['Age'].mean()})
missing3

### 2.4 Combining Datasets

#### 2.4.1 concat( ) and append( )
First we'll look at some simple functions that are used to combine datasets, <b>concat( ) and append( )</b>.
<br>Let's look at pd.concat( ) first. 
<br><b>Note</b>: We'll use data frames for our examples, but series can be used just the same.

In [None]:
# create two new data frames
sample1 = pd.DataFrame(data={'A':[1, 2], 'B':[3, 4]}, index=[0, 1])
sample2 = pd.DataFrame(data={'A':[5, 6], 'B':[7, 8]}, index=[0, 1])
display('sample1', sample1, 'sample2', sample2)

In [None]:
# now we'll see 2 ways of concatenating them

# row-wise concatenation
sample3 = pd.concat([sample1, sample2], axis=0)

# column-wise concatenation
sample4 = pd.concat([sample1, sample2], axis=1)

display('sample3',sample3, 'sample4', sample4)

An important parameter for this function is '<b>join</b>'. Let's look at an example:

In [None]:
# create two new dataframes
sample5 = sample1.copy()
sample6 = pd.DataFrame(data={'B':[9, 10], 'C':[11, 12]}, index=[0, 1])

display('sample5', sample5, 'sample6', sample6)

In [None]:
# join these data frames over B using pd.concat()
display('Joined', pd.concat([sample5, sample6], join='inner'))

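Now, we'll look at the <b>append( )</b> function. It gives a similar result to <b>concat( )</b>, but it is more limited because it only 'appends' row-wise. <b>Note</b>: `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0; in current pandas the same row-wise result is obtained with `pd.concat`.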

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat with its default axis=0 stacks the rows the same way append did.
display('append', pd.concat([sample1, sample2]))

#### 2.4.2 Merge and Join
Now we'll look at some of the more sophisticated methods for combining datasets.
<br>First we'll see how to 'merge' datasets. We'll do that using <b>pd.merge( )</b>.

In [None]:
left = pd.DataFrame({'A': [1, 2], 'B': [2, 3]})
right = pd.DataFrame({'A': [4, 5, 6], 'B': [2, 2, 2]})

display('left', left, 'right', right)

In [None]:
# outer join over column 'B'
merger = pd.merge(left, right, on='B', how='outer', indicator=True)
merger

In addition to columns, we can also merge on index.

In [None]:
# create new data frames
df1 = pd.DataFrame({'Age':list(age.values())}, index=names)
df2 = pd.DataFrame({'Dept_ID':[1, 2, 3, 4, 1, 3, 1]}, index=names)
display('df1', df1, 'df2', df2)

In [None]:
# let's use index to merge
pd.merge(df1, df2, left_index=True, right_index=True)

The <b>join( )</b> method does the same thing, since it combines data frames on their index by default.

In [None]:
df3 = df1.join(df2)
df3

Now let's look at how we would merge data frames with different sizes and columns.

In [None]:
# Department lookup table, written one (id, name) record per row.
df4 = pd.DataFrame(
    [(1, 'CSE'), (2, 'FRE'), (3, 'ECE'), (4, 'DS'), (5, 'BIO')],
    columns=['Department_ID', 'Department'],
)
df4

This data frame is dedicated to information about departments. 'Department_ID' is its identifying column (primary key); the same IDs appear in 'df3' as 'Dept_ID' (a foreign key), alongside each person's name and age. Now we'll see ways to combine these datasets.
<br>As the department ID columns are named differently in the two data frames, instead of just using the 'on' parameter as earlier, we'll have to specify the column from each data frame on which to join/merge (using 'left_on' and 'right_on').

In [None]:
# inner join
# Merging on columns ignores the index of both frames, and df3 keeps the
# people's names in its index. Moving the index into a regular 'Name' column
# first keeps the names in the merged result.
pd.merge(df3.rename_axis('Name').reset_index(), df4,
         left_on='Dept_ID', right_on='Department_ID', how='inner')

In [None]:
# outer join
# Merging on columns ignores the index of both frames, and df3 keeps the
# people's names in its index. Moving the index into a regular 'Name' column
# first keeps the names in the merged result. indicator=True adds a '_merge'
# column saying which side each row came from.
pd.merge(df4, df3.rename_axis('Name').reset_index(),
         left_on='Department_ID', right_on='Dept_ID', how='outer', indicator=True)

### 2.5 Grouping and Aggregation

In this section we'll see how we can leverage the properties of a data frame and perform certain groupings and aggregations to better understand the data. Let's look at some examples.

In [None]:
# new series for number of hours worked
hours = pd.Series([30, 21, 40, 40, 35, 50, 15])
hours

In [None]:
# simple aggregations on series
print('Hours sum:', hours.sum())
print('Hours mean', hours.mean())

In [None]:
# add hours to df1
df1['Hrs_Worked/week'] = hours.values
df1

In [None]:
# calculate mean column-wise
df1.mean()

In [None]:
# calculate mean row-wise
df1.mean(axis='columns')

Let's look at the <b>describe( )</b> function that is usually used to look at these aggregations in a data frame.

In [None]:
# describe()
df1.describe()

Now, we'll look at how to group the data and perform operations. This is one of the most important and widely used transformations on data frames.
<br>The function we'll use here is <b>groupby( )</b>.

In [None]:
# new data frame with Dept_ID
df5 = df1.join(df2).reset_index().rename(columns={'index':'Name'})
df5

Aggregations on <b>groupby( )</b>.

In [None]:
# display names of people with maximum age in each department
# idxmax() returns, for every department, the row label of its oldest member;
# .loc then pulls that person's full row. Taking max() of 'Name' and 'Age'
# separately would pair the alphabetically last name with the largest age, even
# when they belong to different people. It also needed a tuple of column names
# on the groupby, which pandas 2.0 no longer accepts.
oldest_rows = df5.groupby('Dept_ID')['Age'].idxmax()
df5.loc[oldest_rows, ['Dept_ID', 'Name', 'Age']].set_index('Dept_ID')

Filtering on <b>groupby( )</b>.

In [None]:
# display departments with minimum age greater than 10
df5.groupby('Dept_ID').filter(lambda x: x['Age'].min()>10)

Applying functions over dataframe after <b>groupby( )</b>.

In [None]:
# normalize hours worked per week for each department
def func_normalize(x):
    """Return a copy of ``x`` whose 'Hrs_Worked/week' column sums to 1.

    Parameters
    ----------
    x : pandas.DataFrame
        A frame (typically one group handed over by ``groupby(...).apply``)
        that contains a 'Hrs_Worked/week' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns, where each
        'Hrs_Worked/week' value is divided by the column total of ``x``.
        The input frame is left unmodified.
    """
    hours = x['Hrs_Worked/week']
    # assign() builds a new frame rather than writing into the group that
    # groupby passed in, so the caller's data is never changed as a side effect.
    # The ** form is required because the column name is not a valid identifier.
    return x.assign(**{'Hrs_Worked/week': hours / hours.sum()})
    
# group_keys=False keeps the original row index on the result. Without it,
# pandas >= 2.0 prepends Dept_ID as an extra index level for apply().
df5.groupby('Dept_ID', group_keys=False).apply(func_normalize)

### 2.6 Importing data in Pandas

Here we'll look at one of the most common file formats used i.e. CSV.
<br>There are many more formats supported by pandas. For the purpose of this exercise, we'll only look at importing CSV files.

In [None]:
# get US states data 
population = pd.read_csv('data-USstates/state-population.csv')
area = pd.read_csv('data-USstates/state-areas.csv')
abbrev = pd.read_csv('data-USstates/state-abbrevs.csv')

In [None]:
population.head()

In [None]:
area.head()

In [None]:
abbrev.head()

Let's apply what we learned above and merge these three files.

In [None]:
df6 = pd.merge(area, abbrev, on='state')
df6.head()

In [None]:
df7 = pd.merge(population, df6, left_on='state/region', right_on='abbreviation', how='outer').drop('abbreviation', axis=1)
df7.head()

In [None]:
df7.describe()

In [None]:
# finally let's apply a grouping to this data
# Rows are grouped by (state/region, ages, year); a group is kept only when its
# minimum population exceeds 3,000,000 and its minimum area exceeds 50,000 sq. mi.
# NOTE(review): selecting [df7.columns] on the groupby names every column of df7,
# so it looks redundant -- confirm before simplifying.
df7.groupby(['state/region', 'ages', 'year'])[df7.columns].filter(
    lambda x: x['population'].min()>3000000 and x['area (sq. mi)'].min()>50000
)

## 3. Intro to Matplotlib

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')

In [None]:
#For all Matplotlib plots, we start by creating a figure and an axes.
fig = plt.figure()
ax = plt.axes()

In [None]:
fig = plt.figure()
ax = plt.axes()
#Start point is 0 and end point is 10 number of samples is 1000
x = np.linspace(0, 10, 1000)
ax.plot(x, np.sin(x));

In [None]:
#can also be done like this
plt.plot(x, np.sin(x));

In [None]:
#Compare sin and cos
plt.plot(x, np.sin(x))
plt.plot(x, np.cos(x));

<b>Adjusting the Plot: Line Colors and Styles:</b>

In [None]:
plt.plot(x, np.sin(x - 0), color='blue')        # specify color by name
plt.plot(x, np.sin(x - 1), color='g')           # short color code (rgbcmyk)
plt.plot(x, np.sin(x - 2), color='0.75')        # Grayscale between 0 and 1
plt.plot(x, np.sin(x - 3), color='#FFDD44')     # Hex code (RRGGBB from 00 to FF)
plt.plot(x, np.sin(x - 4), color=(1.0,0.2,0.3)) # RGB tuple, values 0 to 1
plt.plot(x, np.sin(x - 5), color='chartreuse'); # all HTML color names supported

<b>adjusting line style:</b>

In [None]:
plt.plot(x, x + 0, linestyle='solid')
plt.plot(x, x + 1, linestyle='dashed')
plt.plot(x, x + 2, linestyle='dashdot')
plt.plot(x, x + 3, linestyle='dotted');

# For short, you can use the following codes:
plt.plot(x, x + 4, linestyle='-')  # solid
plt.plot(x, x + 5, linestyle='--') # dashed
plt.plot(x, x + 6, linestyle='-.') # dashdot
plt.plot(x, x + 7, linestyle=':');  # dotted

In [None]:
plt.plot(x, x + 0, '-g')  # solid green
plt.plot(x, x + 1, '--c') # dashed cyan
plt.plot(x, x + 2, '-.k') # dashdot black
plt.plot(x, x + 3, ':r');  # dotted red

<b>Labeling Plots:</b>

In [None]:
plt.plot(x, np.sin(x))
plt.title("A Sine Curve")
plt.xlabel("x")
plt.ylabel("sin(x)");

In [None]:
plt.plot(x, np.sin(x), '-g', label='sin(x)')
plt.plot(x, np.cos(x), ':b', label='cos(x)')
plt.axis('equal')

plt.legend();

In [None]:
#In the previous section we looked at plt.plot/ax.plot to produce line plots. 
#It turns out that this same function can produce scatter plots as well:
x = np.linspace(0, 10, 30)
y = np.sin(x)

plt.plot(x, y, 'o', color='black');

#The third argument in the function call is a character that represents the type of symbol used for the plotting. 
#Just as you can specify options such as '-', '--'


In [None]:
#A second, more powerful method of creating scatter plots is the plt.scatter
plt.scatter(x, y, marker='o');

Now, let's use the US states data we imported earlier to create some visualizations in matplotlib.

In [None]:
# Horizontal bar chart of every state's area, scaled to thousands of square miles.
fig1, a= plt.subplots(1, 1, figsize=(16, 12))
a.barh(area['state'], width=area['area (sq. mi)']/1000)
a.set_xlabel('Area (*1000 sq. miles)')
a.set_ylabel('State')
a.set_title('Area of US states');  # trailing ';' keeps the Text repr out of the cell output