## numpy

In [None]:
import numpy as np

In [None]:
# numpy's array method takes any list, tuple or array-like object and converts it to an ndarray
# note that numpy arrays are of homogeneous type...not mixed type like lists.
# build the ndarray from a plain Python list holding the integers 0 through 9
an_array = np.array(list(range(10)))
# show the array contents and, on the next line, its type
print(an_array, type(an_array), sep="\n")

In [None]:
# nested arrays are arrays that have arrays as values

# a 0-D array is just a scalar...each value in a 1D array is a 0-D array itself
scalar = np.array(20)

In [None]:
# a 1-D array has scalars as its elements.  Think of a single vector of scalars.
basic_array = np.array([1, 2, 3, 4, 5])
print(basic_array)

In [None]:
# a 2-D array is just an array of arrays. Think of a matrix or table.
two_dim_array = np.array([[1, 2, 3], [4, 5, 6]])
print(two_dim_array)
# we can evaluate the shape attribute of ndarrays as well
print(two_dim_array.shape)

In [None]:
# a 3-D array has 2-D array elements.
three_dim_array = np.array([[[2, 4, 6], [8, 10, 12]], [[14, 16, 18], [20, 22, 24]]])
print(three_dim_array)

In [None]:
# the ndim attribute gives the number of dimensions in an ndarray
# Note that ndarrays can have an arbitrarily large number of dimensions.
print(three_dim_array.ndim)

## Indexing ndarrays

In [None]:
# we index 1D arrays just like lists and tuples
one_dim = np.array(list(range(10)))
# positions 1 through 4; as with lists, the stop index (5) is excluded
print(one_dim[1:5])

In [None]:
# to index 2D arrays we use comma separated values to address dimension and index
# think of the 1st dimension as the row and the index as the column
two_dim = np.array([[1, 2, 3], [4, 5, 6]])
print(two_dim[1, 2])
print(two_dim[0, 0])

In [None]:
# higher dimensional arrays are indexed similarly, the first integer represents the first dimension,
# the second integer represents the second dimension and so on.
three_dim = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
print(three_dim)
print(three_dim[0, 1, 2]) # prints third element of second array of first array

In [None]:
# above the 0 allows us to access the first 2D array...see below
print(three_dim[0])

In [None]:
# from there on the next two positions are just accessing values within the first 2D array
print(three_dim[0, 1]) # this returns second row from first 2D array

In [None]:
# Here we're accessing the second 2D array, then the first row of that array, and then the second element of that row.
print(three_dim[1, 0, 1])

In [None]:
# We can use negative indexing as well.
# Here we access the last element of the last row of the last 2D array.
print(three_dim[-1, -1, -1])

In [None]:
# and here we access the first element of the last row of the first array
print(three_dim[0, -1, 0])

In [None]:
# and now the first element of the first row of the first array
print(three_dim[0, -2, 0])

## Slicing ndarrays

Slicing is also similar to what we've experienced with lists and tuples.
We slice with [start: end] or [start: end: step]
As before, omitting the starting index assumes zero and omitting the end assumes the length of the array

In [None]:
# We'll skip an explanation of slicing 1D arrays and jump to higher dimensions
two_dim = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
# Here we're taking the elements at indices 1 and 2 (the slice 1:3 excludes index 3) of the second row (row index 1)
print(two_dim[1, 1:3])

In [None]:
# Now we're accessing the last two values from both dimensions
# This will return a 2D array
print(two_dim[0:2, 2:4])

In [None]:
# the following does the same as the preceding line
print(two_dim[:, 2:4])

In [None]:
# 3D arrays are sliced similarly
three_dim = np.array([[[1, 2, 3, 4], [5, 6, 7, 8]], [[9, 10, 11, 12], [13, 14, 15, 16]]])

In [None]:
# accessing last two elements of last row of second 2D array.
print(three_dim[1, 1, 2:])

In [None]:
# returns a 2D array
print(three_dim[0:2, 1, 1:])

In [None]:
# returns a 3D array
print(three_dim[0:2, 0:2, 1:])

## Data types in numpy

Numpy supports strings (S), integers (i), floats (f), bools (b) and complex numbers (c)
and also has some additional data types such as:
unsigned integer (u), timedelta (m), datetime (M), object (O), unicode string (U)

In [None]:
# ndarrays have an attribute dtype that will reveal the datatype of the array
two_dim = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
print(two_dim.dtype)

In [None]:
foo = np.array([i for i in 'abcdefg'])
print(foo.dtype)
# notice that the dtype here was unicode string
# we can be explicit about the dtype when creating an array

In [None]:
foo = np.array([i for i in 'abcdefg'], dtype = 'S')
print(foo.dtype)

In [None]:
# size can also be specified for i, u, f, S and U
foo = np.array([15, 16, 17, 16,], dtype = 'S2')
print(foo.dtype)

In [None]:
# notice what happens when we specify single byte string dtype with these ints...our values are truncated.
foo = np.array([15, 16, 17, 16,], dtype = 'S1')
print(foo)
# an exception will occur if you try to specify a dtype to which the values in the array can't be cast

In [None]:
# oftentimes you need to cast an entire array to another type
# numpy offers the method astype() which takes the new type as a parameter and returns a copy.
# datatypes can be specified with the single char version or the name. For example 'f' (float32) or the Python type float (float64).

an_array = np.array([1, 2, 3, 4, 5])
an_array.astype('S') # this will just return a copy, so we need to assign it to a variable or overwrite it

In [None]:
an_array = an_array.astype('S')
print(an_array.dtype)

In [None]:
# copy() and view() methods
# These concepts are related to the aliasing of variables

# In numpy we can use the copy() method to make copies of arrays.
# Changes made to the original or the copy have no impact on the other.
# A view() of an array just points to the original array, so changes made
# to the original or the view will impact the other.

# .copy()
an_array = np.array([1, 2, 3, 4])
a_copy = an_array.copy()

In [None]:
an_array[0] = 99
a_copy[1] = 99

print(an_array)
print(a_copy)

In [None]:
# .view()
an_array = np.array([1, 2, 3, 4])
a_view = an_array.view()

an_array[0] = 99
a_view[1] = 99

print(an_array)
print(a_view)

In [None]:
# We can think about this in terms of ownership of the data.
# A copy owns the data and a view does not.
# Data ownership can be assessed using the base attribute of an ndarray.

an_array = np.array([1, 2, 3, 4])
a_copy = an_array.copy()
a_view = an_array.view()

# If the array owns the data the base attribute will return None
# If not the base attribute returns a reference to the original object
print(a_copy.base)
print(a_view.base)

In [None]:
# if we modify an element in the original array returned from the base attribute
# it will modify the original array.
a_view.base[0] = 99

# now if we print the original array, the copy and the view,
# the original and the view will have been modified by the preceding statement.
print(an_array)
print(a_copy)
print(a_view)

In [None]:
# the shape attribute returns a tuple (of length .ndim) giving the number of elements along each dimension
one_dim = np.array([1, 2, 3, 4])
two_dim = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
three_dim = np.array([[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]])

In [None]:
print(one_dim.shape) # one dimension with 4 elements
print(two_dim.shape) # two dimensions: 2 rows and 4 columns (8 elements in total)
print(three_dim.shape) # three dimensions. Two three x four 2D arrays.

In [None]:
# reshape()
# The reshape method allows us to change the shape of an array, redistributing its elements across dimensions.
# You can reshape into any shape as long as the total number of elements exactly matches that shape.
# for example, if we had a 1D array of length 9 then we couldn't reshape into a 2D array of shape (2, 5)
# a 1-D array holding the 20 integers 1 through 20
an_array = np.array(list(range(1, 21)))


In [None]:
# this, of course, won't work: a 7 x 3 array needs 21 elements and we only have 20.
# The ValueError is the point of the demonstration, so we catch and print it
# instead of letting it halt a top-to-bottom run of the notebook.
try:
    an_array.reshape(7, 3)
except ValueError as err:
    print(err)

In [None]:
# this, however, is feasible
new_array = an_array.reshape(4, 5)
print(new_array)

In [None]:
# or this
three_dim = an_array.reshape(2, 5, 2)
print(three_dim) # now we have 3D array consisting of two five x two arrays

In [None]:
# Note that reshaping returns a view
print(an_array.reshape(2, 5, 2).base)

In [None]:
# so if we alter a value in the reshaped array then we'll alter the original array we reshaped.
an_array.reshape(2, 5, 2)[0, 0, 0] = 99
print(an_array)

In [None]:
# We assigned the reshaped object to the variable three_dim, and changes made here
# will also be reflected in the original array
print(three_dim.base)

In [None]:
# this operation will also have an impact on our original array.
three_dim[0, 0, 1] = 99
print(an_array)

In [None]:
# Note that we can avoid this by saving a copy of the view.
one_dim = np.array([1, 2, 3, 4, 5, 6, 7, 8])
two_dim = one_dim.reshape(4, 2).copy()
two_dim[0, 0] = 99
print(two_dim.base)
print(two_dim)
print(one_dim)

## np.zeros and np.ones

We can also create arrays whose values are pre-initialized (for example all zeros, all ones, or random values).


In [None]:
# np.zeros() takes a shape and will initialize an ndarray
print(np.zeros(10))

In [None]:
three_dim_z = np.zeros((2,3,4))
print(three_dim_z)

In [None]:
# np.ones()
print(np.ones(10))

In [None]:
two_dim_ones = np.ones((4,4))
print(two_dim_ones)

In [None]:
# np.random.random()
print(np.random.random(10))

In [None]:
two_dim_rand = np.random.random((4, 4))
print(two_dim_rand)

In [None]:
# np.arange() similar to range()...takes start stop and step values

three_dim = np.arange(1, 51).reshape(5, 2, 5)
print(three_dim)

In [None]:
# operations between two vectors
# numpy allows us to easily do operations between arrays
print(two_dim_ones + two_dim_rand)
print(two_dim_ones / two_dim_rand)

In [None]:
# we can also do operations between scalars and ndarrays
# this gives us a good toolset for performing matrix operations.
print(two_dim_ones * 1.5)

In [None]:
# There are a number of array methods that provide valuable aggregate information
# .min(), .max(), .sum(), mean(), std()
print(two_dim_rand.max())
print(two_dim_rand.min())
print(two_dim_rand.sum())
print(two_dim_rand.mean())
print(two_dim_rand.std())

In [None]:
# get the mean of first column of two_dim_rand
print(two_dim_rand[:,0].mean())

In [None]:
# we can specify the axis of the ndarray to accomplish this as well
# axis=0 collapses the rows (one result per column) and axis=1 collapses the columns (one result per row)
print(two_dim_rand.mean(axis=1)) # aggregates along column, so will return means for each row
print(two_dim_rand.mean(axis=0)) # aggregates along rows, so will return means for each column

In [None]:
# We can calculate dot products with the .dot() method
array_1 = np.array([1, 2, 3, 4])
array_2 = np.array([1, -2, -3, 4])

print(array_1.dot(array_2))

In [None]:
# transposition
matrix = np.random.random((2, 4))
print(matrix)
print(matrix.T)

## Iterating over ndarrays

In [None]:
# prints elements
# iterating a 1-D array visits its scalar elements one at a time
for i in np.array(list(range(10))):
    print(i)

In [None]:
# prints rows
for i in np.arange(1,21).reshape(4, 5):
    print(i)

In [None]:
# prints 2-D matrices
for i in np.random.random((2,5,2)):
    print(i)

In [None]:
# nditer()
for i in np.nditer(np.random.random((2,5,2))):
    print(i)

In [None]:
# ndenumerate
for i, j in np.ndenumerate(np.random.random((2,5,2))):
    print(i, j)

## pandas

We'll discuss grouping, pivoting, merging and other more complex operations next week.  We'll start out with some pandas basics.

In [None]:
import pandas as pd
from pandas import DataFrame

In [None]:
# note that the string argument for read_csv can be a url or a file path
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data')

In [None]:
data.head()

In [None]:
# we allowed pandas to infer the existence of a header...there is none, so let's be explicit.
# we can set header equal to None or we can specify column names.

data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
                  names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'])

In [None]:
data.head()
# that's better

In [None]:
data.index

In [None]:
data.columns

In [None]:
# retrieve dtypes for columns

data.dtypes

In [None]:
# oftentimes it's helpful to have access to a sorted array of features
for i in data.columns.sort_values():
    print(i)

In [None]:
data.describe()

In [None]:
# notice how we only received descriptives for the numerical fields
data[['class']].describe()

In [None]:
# alternatively
data.describe(include=object)

In [None]:
data['class'].value_counts()

In [None]:
# let's define a function to do something trivial
def split_class(cl):
    """Return the second "-"-separated piece of ``cl``.

    For example "Iris-setosa" gives "setosa". Raises IndexError when
    ``cl`` contains no "-".
    """
    pieces = cl.split("-")
    return pieces[1]

In [None]:
data['species'] = data['class'].apply(split_class)

In [None]:
# show the 'species' column we just derived with split_class
print(data['species'])

In [None]:
# drop the column we just created inplace
data.drop(columns='species', inplace=True)

In [None]:
# np.vectorize is a convenient way to apply a function taking multiple arguments to all rows of a dataframe
# (note: internally it is essentially a Python loop, so it is provided for convenience rather than speed)
def make_nonsense(cl, sw, sl):
    """Combine sepal width and length into an arbitrary demo value.

    cl -- class label of the row
    sw -- sepal width
    sl -- sepal length

    Returns sw * sl when cl is 'Iris-setosa', otherwise sw * (sl * 0.05).
    """
    # only the length factor depends on the class; the width is always used as-is
    length_factor = sl if cl in ('Iris-setosa',) else sl * 0.05
    return sw * length_factor

# np.vectorize takes a function and returns a vectorized function
data['nonsense'] = np.vectorize(make_nonsense)(data['class'], data['sepal_width'], data['sepal_length'])

In [None]:
print(data.nonsense)
data.drop(columns='nonsense', inplace=True)

## indexing

In [None]:
data.loc[data['class'] == 'Iris-setosa', :]

In [None]:
data.loc[data['class'] == 'Iris-setosa', ['sepal_width', 'sepal_length']]

In [None]:
# iloc for integer based indexing
data.iloc[:, 4]

In [None]:
data.iloc[0, :]

In [None]:
data.iloc[:2, :]

In [None]:
# isolating row indices meeting a condition
location = data.index[(data['class'] == 'foo')].tolist()

In [None]:
data.drop(index=location, inplace=True)

In [None]:
# creating dataframes from dictionaries

In [None]:
# Using the DataFrame function
a_dict = {'name': ['Randy', 'Gerald', 'Susan', 'Louise'],
         'age': [45, 45, 41, 48],
         'nickname': ['Rand', 'Ger', 'Susie', 'Weezy'],
         'id': ['871', '872', '873', '874']}
a_df = DataFrame(a_dict)
print(a_df)

In [None]:
# using the .from_dict method
a_df = DataFrame.from_dict(a_dict)
print(a_df)

In [None]:
# note that .from_dict has an 'orient' argument. If keys should be columns
# pass 'columns' to orient (default behavior).  If keys should be row indices 
# pass 'index' to this parameter.  See below...

In [None]:
a_df = DataFrame.from_dict(a_dict, orient='index')
print(a_df)

In [None]:
# let's go back to the original approach
a_df = DataFrame(a_dict)

In [None]:
a_df.set_index('id', inplace=True)

In [None]:
# notice that our row indices are now the id values.  The index object now has a name attribute.
a_df.index

In [None]:
# say we want to store our index values as a column and revert back to integer-based indexing.
a_df.reset_index(drop=False, inplace=True)

In [None]:
a_df.head()

In [None]:
# let's rename a column
a_df.rename(columns={'name': 'first_name'},
           inplace=True)

print(a_df.head())

## Let's try to parse some of our twitter returns into DataFrames

In [None]:
import requests
import json
import base64
from pprint import pprint

# the bearer token is read from the third line of twitter_keys.txt
with open('twitter_keys.txt') as f:
    lines = f.readlines()
# readlines() keeps each line's trailing newline; strip it so the token can be
# placed in the Authorization header without stray whitespace
bearer_token = lines[2].strip()

def connect_to_endpoint(url, headers, params, timeout=30):
    """Execute a GET request against a v2 twitter api endpoint.

    url -- endpoint URL
    headers -- request headers (the caller supplies the Authorization header)
    params -- query-string parameters
    timeout -- seconds to wait for the server before giving up (default 30)

    Prints the HTTP status code, then returns the decoded JSON body.
    Raises Exception(status_code, response_text) when the status is not 200.
    """
    # without a timeout, requests waits indefinitely on an unresponsive server
    response = requests.get(url, headers=headers, params=params, timeout=timeout)

    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [None]:
url = "https://api.twitter.com/2/tweets/search/recent"

headers = {"Authorization": "Bearer {}".format(bearer_token)} 

query_params = {'query': '"Russell Wilson" has:mentions -is:retweet has:media is:verified',
                'max_results': 10,
                'tweet.fields': 'id,author_id,text,geo,conversation_id,entities',
                'expansions': 'author_id,geo.place_id',
                'user.fields': 'name,username,verified,location',
                'place.fields': 'country_code,geo,name,place_type'}

In [None]:
data = connect_to_endpoint(url, headers, query_params)

In [None]:
pprint(data)

In [None]:
# this will sometimes be a mess depending on the structure of your data
df = DataFrame(data['data'])

In [None]:
df

In [None]:
# we've got some redundancy in our returned fields.  We'll drop two columns.
df.drop(columns=['conversation_id', 'edit_history_tweet_ids'],
       inplace=True)

In [None]:
df

In [None]:
# the entities variable isn't well suited for analysis.
# let's take a closer look at it.  
print(type(df.loc[0, 'entities']))
pprint(df.loc[0, 'entities'])

In [None]:
# perhaps we just want the mentions
def count_mentions(ent):
    """Return the number of mentions recorded in one tweet's entities value.

    ent -- the 'entities' value for a single tweet; expected to be a dict
           that may hold a 'mentions' list.

    Returns 0 instead of raising when ent is not a dict (pandas fills NaN
    for records that lack the key) or when it has no 'mentions' entry.
    """
    if not isinstance(ent, dict):
        return 0
    # `or []` also covers a 'mentions' key that is present but set to None
    return len(ent.get('mentions') or [])

df['mention_count'] = np.vectorize(count_mentions)(df['entities'])

print(df[['mention_count']])

In [None]:
# perhaps we're done with the entities column
# let's drop it
df.drop(columns='entities',
       inplace=True)

In [None]:
# remember, our return object was a dictionary with three keys.
# the 'data' key points to the majority of the tweet info,
# but we also have 'meta' and 'includes'.  We'll ignore 'meta' for now.
# all of the includes data is associated with a 'users' key.

# these are the user.fields data we requested: name, username, verified, location
# not all records will necessarily have all keys.

pprint(data['includes'])

#user_df = DataFrame(data['includes']['users'])  

In [None]:
# let's create a user DataFrame
user_df = DataFrame(data['includes']['users'])  

In [None]:
user_df

In [None]:
# an alternative approach is to create a generator.  You'll see this in HW4
import copy

def user_generator(users):
    """Lazily yield an independent deep copy of each record in ``users``.

    The yielded objects share no state with the originals, so mutating
    them leaves ``users`` untouched.
    """
    for record in users:
        yield copy.deepcopy(record)

user_df2 = DataFrame(user_generator(data['includes']['users']))

In [None]:
user_df2

In [None]:
# let's merge these....we'll talk more about merging next week.

# notice that we have duplication between the two tables in terms of column names (both have 'id').
# By default, merge appends the suffixes _x and _y to overlapping column names. We can rename
# columns beforehand to avoid this behavior. Renaming afterwards is fine too.

df.rename(columns={'id':'tweet_id'}, inplace=True)

df = df.merge(user_df, how='left', left_on='author_id', right_on='id')

In [None]:
df