# Hello World - JupyterLab, Pandas, Scikit-Learn, PyTorch
Author: Carwyn Collinsworth
Date: 01/25/2023

In [32]:
import pandas as pd
import numpy as np
import torch
import sklearn

## Numpy Introduction

### Instantiating arrays

In [58]:
# Defining a basic numpy array
a = np.array([0,1,2,3], dtype = np.uint8)
b = np.array([[1,2],[3,4]], dtype = np.int32)

# Specifying a size of array
c = np.arange(10)
print("arange", c)
d = np.linspace(1,5,9) # Format Start, Stop, Number of elements
print("linspace", d)
g = np.zeros((3,2)) # Format rows, columns
print("\nzeros (3,2)\n", g)
h = np.ones((2,2))*5
print("ones (2,2)*5\n", h)

# 2D array size instantiation
e = np.eye(3)
print("\neye: \n", e, "\n")
f = np.diag([1,3,2,5])
print("diag: \n", f, "\n")

# Numpy random numbers
from numpy.random import default_rng
r = default_rng(42).random((2,3))
print("random: \n", r)

arange [0 1 2 3 4 5 6 7 8 9]
linspace [1.  1.5 2.  2.5 3.  3.5 4.  4.5 5. ]

zeros (3,2)
 [[0. 0.]
 [0. 0.]
 [0. 0.]]
ones (2,2)*5
 [[5. 5.]
 [5. 5.]]

eye: 
 [[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]] 

diag: 
 [[1 0 0 0]
 [0 3 0 0]
 [0 0 2 0]
 [0 0 0 5]] 

random: 
 [[0.77395605 0.43887844 0.85859792]
 [0.69736803 0.09417735 0.97562235]]


### Indexing arrays

In [66]:
# Indexing a 2d array
print("\nformat: [row, column]: ", r[1,2])
print("use negative numbers to index backwards: ", c[-1])
# Note that x[0, 2] == x[0][2], but x[0][2] is more inefficient because a temporary array x[0] is created

# Slicing
print("slicing [-5:9:2] on array [0 1 2 3 4 5 6 7 8 9] should return [5,7]", c[-5:9:2])
print("negating the sampling index (and fixing bounds: inclusion exclusion) makes it reverse in order", c[7:-6:-2])

# Use : to get all of a dimension, to expand to all dimensions, use ellipses
m = np.array([[[1],[2],[3]], [[4],[5],[6]]])
print("\nm[:,0,:] should equal [[1 2 3],[4 5 6]]\n", m[:,:,0])
print("equivalently, m[...,0]: \n", m[...,0])


format: [row, column]:  0.9756223516367559
use negative numbers to index backwards:  9
slicing [-5:9:2] on array [0 1 2 3 4 5 6 7 8 9] should return [5,7] [5 7]
negating the sampling index (and fixing bounds: inclusion exclusion) makes it reverse in order [7 5]

m[:,0,:] should equal [[1 2 3],[4 5 6]]
 [[1 2 3]
 [4 5 6]]
equivalently, m[...,0]: 
 [[1 2 3]
 [4 5 6]]


### Merging and joining arrays

In [43]:
# Merging, joining arrays
# Note that numpy arrays are objects, you have to copy otherwise they are referential
a = np.arange(4)
i = a[0:2]
i += 1
print("variable a is impacted by updating i: ", a, i)

# Copying a variable
a = np.arange(4)
i = a[0:2].copy()
i += 1
print("variable a is not impacted by updating copy i: ", a, i)


# Use np.vstack, np.hstack, or np.block to concatenate arrays
j = np.array([1,2])
k = np.array([3,4])
print("\n[1,2], [3,4] joined vertically using vstack: \n", np.vstack([j,k]))
print("\n[1,2], [3,4] joined horizontally using hstack: \n", np.hstack([j,k]))

variable a is impacted by updating i:  [1 2 2 3] [1 2]
variable a is not impacted by updating copy i:  [0 1 2 3] [1 2]

[1,2], [3,4] joined vertically using vstack: 
 [[1 2]
 [3 4]]

[1,2], [3,4] joined horizontally using hstack: 
 [1 2 3 4]


### Loading array from file

In [96]:
l = np.loadtxt("sample.txt", delimiter=",", skiprows=1, dtype=np.uint8) # Skips the first skiprows rows (in this case that is the header)
print(l)

# Use genfromtxt() for text -> array with formatting

# With autostrip - removes white space in character arrays
from io import StringIO
data = u"1, abc , 2\n 3, xxx, 4"
p = np.genfromtxt(StringIO(data), delimiter=",", dtype="|U5", autostrip=True)
print(p)

[[1 2]
 [3 4]
 [5 6]
 [7 8]]
[['1' 'abc' '2']
 ['3' 'xxx' '4']]


### Manipulating arrays

In [105]:
c = np.arange(10)
print("c: ", c)
c = c.reshape((5,2))
print("c reshaped to (5,2):\n", c)

n = np.array([[1,2,np.nan], [np.nan,2, 1]])
print("selecting all non nan elements:", n[~np.isnan(n)]) #note, does not work for None

o = (n > 1)
print("boolean finding all values > 1: ", o)
o = (np.isnan(n)) # == np.nan does not work!!!
print("boolean finding all values > 1: ", o)

# Ufunc +
s = np.array([[1,2,3],[4,5,6]])
t = np.array([[2,3,4],[1,2,3]])
print("s+t=\n", s+t)
print("reduce add s on axis 1:", np.add.reduce(s,1))
print("reduce mul s on axis 1:", np.multiply.reduce(s,1))

c:  [0 1 2 3 4 5 6 7 8 9]
c reshaped to (5,2):
 [[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]
selecting all non nan elements: [1. 2. 2. 1.]
boolean finding all values > 1:  [[False  True False]
 [False  True False]]
boolean finding all values > 1:  [[False False  True]
 [ True False False]]
s+t=
 [[3 5 7]
 [5 7 9]]
reduce add s on axis 1: [ 6 15]
reduce mul s on axis 1: [  6 120]


### Python "is" subtopic

In [93]:
# Python "is" IMPORTANT NOTE - do not use for immutable (~primative) objects

a, b = 200, 201
print("is", a is b-1)
print("==", a == b-1)

a, b = 300, 301
print("is", a is b-1)
print("==", a == b-1)


# Another example
print("\nAnother example")
def add(a, b): return a + b
print("is", 16 is add(8, 8))
print("is", 1000 is add(500, 500))

is True
== True
is False
== True

Another example
is True
is False


### Views and copies

In [101]:
### Use base to see if you are referring to the original or a view

c = np.arange(10)
print(c)
q = c[0:2]
print(q)

print("Is c a view?: ", c.base is not None)
print("Is q a view?: ", q.base is not None) # q.base returns the original if there is one

r = q[0:1]
print("Is r a view?: ", r.base is not None) #works through layers of views too
print("r view?: ", r.base)

[0 1 2 3 4 5 6 7 8 9]
[0 1]
Is c a view?:  False
Is q a view?:  True
Is r a view?:  True
r view?:  [0 1 2 3 4 5 6 7 8 9]


## Linear algebra operations

In [123]:
u = np.array([[1,2,3],[4,5,6]])
print("u: ", u)
print("u.T: ", u.T)
print("np.transpose(u): ", np.transpose(u))

# Invert matrix:
u = np.vstack([u, np.array([2,2,3])])
print("\nu = \n", u)
inv = np.linalg.inv(u)
print("\nInverse: \n", inv)

# Matrix multiplication vs elementwise
print("np.multiply does elementwise multiplication - u^-1 element* u != I: \n", np.multiply(inv, u))
print("np.dot does matrix multiplication or dot product - u^-1*u == I: \n", np.dot(inv,u))
print("* does element wise multiplication - u^-1*u == I: \n", inv*u)

# np.dot is better than matmul. Matmul cannot take in scalars, while dot handles whatever types you put in, scalar, matrix, 1d array, etc.

u:  [[1 2 3]
 [4 5 6]]
u.T:  [[1 4]
 [2 5]
 [3 6]]
np.transpose(u):  [[1 4]
 [2 5]
 [3 6]]

u = 
 [[1 2 3]
 [4 5 6]
 [2 2 3]]

Inverse: 
 [[-1.          0.          1.        ]
 [ 0.          1.         -2.        ]
 [ 0.66666667 -0.66666667  1.        ]]
np.multiply does elementwise multiplication - u^-1 element* u != I: 
 [[ -1.           0.           3.        ]
 [  0.           5.         -12.        ]
 [  1.33333333  -1.33333333   3.        ]]
np.dot does inner product - u^-1*u == I: 
 [[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
* does inner product - u^-1*u == I: 
 [[ -1.           0.           3.        ]
 [  0.           5.         -12.        ]
 [  1.33333333  -1.33333333   3.        ]]


## Pandas Introduction

### Series

In [131]:
# Creating a Series, a one dimensional array

pa = pd.Series([1,2,4,np.nan,5,None,6])
print(pa[3], pa[5], pa[3] == pa[5], np.isnan(pa[3]), np.isnan(pa[5]))
pa

nan nan False True True


0    1.0
1    2.0
2    4.0
3    NaN
4    5.0
5    NaN
6    6.0
dtype: float64

### Instantiating a dataframe

In [132]:
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


### Loading from file

In [178]:
df = pd.read_csv('cities.csv', index_col=None)
df

Unnamed: 0,LatD,"""LatM""","""LatS""","""NS""","""LonD""","""LonM""","""LonS""","""EW""","""City""","""State"""
0,41,,,"""N""",80,39,0,"""W""","""Youngstown""",OH
1,42,52.0,48.0,"""N""",97,23,23,"""W""","""Yankton""",SD
2,46,35.0,59.0,,120,30,36,"""W""","""Yakima""",WA
3,42,16.0,12.0,"""N""",71,48,0,"""W""","""Worcester""",MA
4,43,37.0,48.0,"""N""",89,46,11,"""W""","""Wisconsin Dells""",WI
...,...,...,...,...,...,...,...,...,...,...
123,39,31.0,12.0,"""N""",119,48,35,"""W""","""Reno""",NV
124,50,25.0,11.0,"""N""",104,39,0,"""W""","""Regina""",SA
125,40,10.0,48.0,"""N""",122,14,23,"""W""","""Red Bluff""",CA
126,40,19.0,48.0,"""N""",75,55,48,"""W""","""Reading""",PA


### Visualization

In [154]:
# use df.head() to view first 5 rows
df.head()

Unnamed: 0,LatD,"""LatM""","""LatS""","""NS""","""LonD""","""LonM""","""LonS""","""EW""","""City""","""State"""
0,41,,,"""N""",80,39,0,"""W""","""Youngstown""",OH
1,42,52.0,48.0,"""N""",97,23,23,"""W""","""Yankton""",SD
2,46,35.0,59.0,,120,30,36,"""W""","""Yakima""",WA
3,42,16.0,12.0,"""N""",71,48,0,"""W""","""Worcester""",MA
4,43,37.0,48.0,"""N""",89,46,11,"""W""","""Wisconsin Dells""",WI


In [155]:
# use df.tail() to view last 5 rows
df.tail()

Unnamed: 0,LatD,"""LatM""","""LatS""","""NS""","""LonD""","""LonM""","""LonS""","""EW""","""City""","""State"""
123,39,31.0,12.0,"""N""",119,48,35,"""W""","""Reno""",NV
124,50,25.0,11.0,"""N""",104,39,0,"""W""","""Regina""",SA
125,40,10.0,48.0,"""N""",122,14,23,"""W""","""Red Bluff""",CA
126,40,19.0,48.0,"""N""",75,55,48,"""W""","""Reading""",PA
127,41,9.0,35.0,"""N""",81,14,23,"""W""","""Ravenna""",OH


In [162]:
# use df.info() to view summary
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   LatD      128 non-null    int64  
 1    "LatM"   127 non-null    float64
 2    "LatS"   126 non-null    float64
 3    "NS"     126 non-null    object 
 4    "LonD"   128 non-null    int64  
 5    "LonM"   128 non-null    int64  
 6    "LonS"   128 non-null    int64  
 7    "EW"     128 non-null    object 
 8    "City"   128 non-null    object 
 9    "State"  128 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 10.1+ KB


In [161]:
# use df.describe() to view summary stats
df.describe()

Unnamed: 0,LatD,"""LatM""","""LatS""","""LonD""","""LonM""","""LonS"""
count,128.0,127.0,126.0,128.0,128.0,128.0
mean,38.820312,30.968504,27.460317,93.25,27.742188,26.960938
std,5.200596,16.329415,18.759808,15.466499,16.927937,18.727807
min,26.0,1.0,0.0,71.0,0.0,0.0
25%,35.0,16.0,11.0,80.0,14.0,11.0
50%,39.0,31.0,24.0,89.5,26.5,23.5
75%,42.25,45.0,47.0,103.25,40.25,47.0
max,50.0,59.0,59.0,123.0,58.0,59.0


In [175]:
# Print column names
print(df.columns)
print(df.columns.str)

Index(['LatD', ' "LatM"', ' "LatS"', ' "NS"', ' "LonD"', ' "LonM"', ' "LonS"',
       ' "EW"', ' "City"', ' "State"'],
      dtype='object')
<pandas.core.strings.accessor.StringMethods object at 0x7efc6774e250>


In [180]:
# Fix column names so they don't include spaces or quotes
df.columns = df.columns.str.replace(' ','')
df.columns = df.columns.str.replace('"','')

In [182]:
# For categorical information, use unique
for col in df:
    print(col)
    
df['State'].unique() # State has a weird format, so lets fix all the titles

LatD
LatM
LatS
NS
LonD
LonM
LonS
EW
City
State


array([' OH', ' SD', ' WA', ' MA', ' WI', ' NC', ' MB', ' VA', ' DE',
       ' ND', ' PA', ' WV', ' TX', ' KS', ' FL', ' CA', ' GA', ' IL',
       ' NY', ' IA', ' CT', ' DC', ' IN', ' MS', ' BC', ' ID', ' AL',
       ' OK', ' AZ', ' CO', ' NJ', ' MI', ' ON', ' SC', ' MO', ' LA',
       ' WY', ' NB', ' NM', ' UT', ' MD', ' OR', ' MN', ' VT', ' NV',
       ' SA', ' OH '], dtype=object)