## Exit Tickets
* Completing Exit Tickets

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

### working with lambda functions
* a Python lambda function is a single expression that doesn't contain any statements

In [2]:
path = './data/bikeshare.csv'
bikes = pd.read_csv(path, index_col='datetime', parse_dates=True)
bikes.head(1)

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01,1,0,0,1,9.84,14.395,81,0.0,3,13,16


In [35]:
# apply some method
bikes.temp.map(lambda x: type(x)).head(1)

datetime
2011-01-01    <class 'float'>
Name: temp, dtype: object

In [36]:
# do some math
bikes.temp.map(lambda x: (x * (9/5)) + 32).head(1)

datetime
2011-01-01    49.712
Name: temp, dtype: float64

In [37]:
# if else: <Return Value if condition is True> if <condition> else <Return Value if condition is False>
bikes.temp.map(lambda x: 1 if x>10 else 0).head(1)

datetime
2011-01-01    0
Name: temp, dtype: int64

### Working with apply

In [2]:
# import the drone data - you might need to install requests to run this locally
response = requests.get("http://api.dronestre.am/data")
json_data = response.json()
drone_df = pd.DataFrame(json_data['strike'])
drone_df.columns

Index(['_id', 'number', 'country', 'date', 'narrative', 'town', 'location',
       'deaths', 'deaths_min', 'deaths_max', 'civilians', 'injuries',
       'children', 'tweet_id', 'bureau_id', 'bij_summary_short', 'bij_link',
       'target', 'lat', 'lon', 'articles', 'names'],
      dtype='object')

In [4]:
# let's assume the target is a(n) intended target of a drone attack and the names are casualties
# we would feel a lot better about this if it were confirmed in documentation
subset = drone_df[['target','names']]
subset.head(10)

Unnamed: 0,target,names
0,,"[Qa'id Salim Sinan al-Harithi, Abu Ahmad al-Hi..."
1,Nek Mohammed,"[Nek Mohammad, Fakhar Zaman, Azmat Khan, Marez..."
2,Haitham al-Yemeni,"[Haitham al-Yemeni, Samiullah Khan]"
3,Abu Hamza Rabia,[]
4,Abu Hamza Rabia,"[Abu Hamza Rabia, Suleiman al-Moghrabi, Amer A..."
5,,[]
6,Ayman al-Zawahiri; Abu Khabab al-Masri; Abd Ra...,[]
7,Maulvi Liaqat,"[Maulvi Liaqat, Mohammad Tahir (16), Maulvi Kh..."
8,,"[Katoor Khan, Taj Alam]"
9,Maulvi Noor Mohammed,[]


In [5]:
# we have sufficiently few values that we can flag cleanup by eyeballing issues
# if we had more values we could sample unique values and do the same
# we see some strings are names separated by semicolons
# some names are followed by question marks
# some values are empty
subset.target.unique()

array(['', 'Nek Mohammed', 'Haitham al-Yemeni', 'Abu Hamza Rabia',
       'Ayman al-Zawahiri; Abu Khabab al-Masri; Abd Rahman al-Masri al-Maghribi; Abu Ubeidah al-Masri; Marwan al-Suri; Khalid Habib; Abdul Hadi al-Iraqi',
       'Maulvi Liaqat', 'Maulvi Noor Mohammed', 'Jalalludin Haqqani',
       'Shaykh Issa al-Masri', 'Abu Laith al-Libi',
       'Abu Sulayman al-Jazairi', 'Baitullah Mehsud',
       'Abu Khabab al-Masri', 'Abdul Rehman', 'Abu Wafa al-Saudi',
       'Qari Imran', 'Hafiz Sahar Gul', 'Khalid Habib', 'Maulvi Nazir',
       'Abu Jihad al-Masri; Abdur Rehman abu Akash', 'Hafiz Gul Bahadar',
       'Hakimullah Mehsud; Baitullah Mehsud', 'Abdullah Azzam al-Saudi',
       'Zubair al-Masri', 'Baitullah Mehsud ally', 'Bahram Khan Kochi',
       'Taj Ali Khan', 'Abu Kasha', 'Hakimullah Mehsud', 'Tariq Khan',
       'Maulvi Nazir; Malang Wazir', 'Malang Wazir', 'Niaz Wali',
       'Hafiz Gul Bahadur', 'Maulvi Nazir?', 'Ilyas Kashmiri',
       'Nader al Shaddadi', 'Saifullah', 'Fa

### cleanup

In [6]:
# get rid of the special characters
subset['target'] = subset.target.replace('?','')
# set empty strings to null values
subset.loc[subset[subset.target == ''].index,'target']=np.nan

In [9]:
# make a sample dataframe to test
sample = subset.iloc[4:5,:].copy()
sample

Unnamed: 0,target,names
4,Abu Hamza Rabia,"[Abu Hamza Rabia, Suleiman al-Moghrabi, Amer A..."


In [10]:
# The split() method is used to split a string into an array of substrings, and returns the new array

# elements in target are separted by semicolons
sample.target.map(lambda x: x.split('; '))

4    [Abu Hamza Rabia]
Name: target, dtype: object

In [17]:
# this looks like a list of a bunch of discrete elements
sample.names.map(lambda x: x)

4    [Abu Hamza Rabia, Suleiman al-Moghrabi, Amer A...
Name: names, dtype: object

In [18]:
# but when we index it, we see it's actually just one element
sample.names.map(lambda x: x[0])

4    Abu Hamza Rabia, Suleiman al-Moghrabi, Amer Az...
Name: names, dtype: object

In [19]:
# that we need to split
sample.names.map(lambda x: x[0].split(', '))

4    [Abu Hamza Rabia, Suleiman al-Moghrabi, Amer A...
Name: names, dtype: object

In [8]:
# with apply, we typically pass in a row  - we could also iterate through columns - as an input
def successful_attack(row):
    # target is a string so we can use the string method split to turn it into a list and iterate through the elements
    for target in row['target'].split('; '):
        # names is a list; we index the list to get the names and split them into a list
        if target in row['names'][0].split(', '):
            # if target is in the list of names, the attack was successful
            return 1
    # otherwise it was not
    return 0

In [9]:
# function works on sample dataframe
sample['success'] = sample.apply(successful_attack, axis=1)
sample

Unnamed: 0,target,names,success
4,Abu Hamza Rabia,"[Abu Hamza Rabia, Suleiman al-Moghrabi, Amer A...",1


In [10]:
# only run function on non-null values
mask = subset.target.notnull()
# function works on entire subset... but yields three successful cases 
subset['success'] = subset[mask].apply(successful_attack, axis=1)
subset.success.value_counts(dropna=False)

NaN     577
 0.0     67
 1.0      3
Name: success, dtype: int64

In [11]:
# while most of the names are actually empty lists, we can see the first example includes the target but isn't 
# captured. With more cleanup we'd catch some more cases.
subset[subset.success==0].head(3)

Unnamed: 0,target,names,success
1,Nek Mohammed,"[Nek Mohammad, Fakhar Zaman, Azmat Khan, Marez...",0.0
3,Abu Hamza Rabia,[],0.0
6,Ayman al-Zawahiri; Abu Khabab al-Masri; Abd Ra...,[],0.0


## Review questions
1. What is the difference betwen map and apply?
2. At a high level, what is the appropriate use case for apply?

# Linear Regression

statquest video: https://www.youtube.com/watch?v=PaFPbb66DxQ&feature=youtu.be

In [3]:
bikes.head(1)

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01,1,0,0,1,9.84,14.395,81,0.0,3,13,16


In [4]:
# first, separate features (stored in a dataframe called X) and target variable (stored in a series called y)
feature_cols = ['temp']
X = bikes[feature_cols]
y = bikes['count']

In [5]:
# feature matrix must be a dataframe
X.head(1)

Unnamed: 0_level_0,temp
datetime,Unnamed: 1_level_1
2011-01-01,9.84


In [6]:
# target variable must be a series
y.head(1)

datetime
2011-01-01    16
Name: count, dtype: int64

### Using scikit-learn

In [9]:
# just like with pandas, numpy, matplotlib, and seaborn, import the objects you'll need from sklearn
from sklearn.linear_model import LinearRegression
# Make an instance of a LinearRegression object.
lr = LinearRegression()
# passing in our features and target variable, fit our model
lr.fit(X, y)

LinearRegression()

In [10]:
# Once fit, we can look at our model's attributes
print(lr.intercept_)
print(lr.coef_)

6.046212959616781
[9.17054048]


In [11]:
X.temp.values[0]

9.84

#### The generic linear regression equation is
$y = \beta_0 + \beta_1x_1$

- $y$ is the target.
- $\beta_0$ is the intercept.
- $\beta_1$ is the coefficient [in middle school math terms, the slope] for $x_1$ (the first feature).

### use the information we've obtained from our regression object to yield a prediction for x when x=9.84

In [13]:
9.84*9.17+6.046

96.2788

# Scoring

In [14]:
# scoring the difference between actual values and predicted values
# in scoring predictions, you might want to maximize something (e.g. accuracy) or minimize something (error)
# root mean squared error is a loss function that evaluates the average squared error of our predictions
from sklearn.metrics import mean_squared_error
# we have an averge error of 168
y_pred = lr.predict(X)
np.sqrt(mean_squared_error(y, y_pred))

166.44886243326746

In [9]:
# our null model, for comparison:
# if we guessed the count would be average each day, we would be off by 181
np.sqrt(mean_squared_error(bikes['count'], bikes['count'].map(lambda x: bikes['count'].mean())))

181.1361335742659

# Working in groups, create a framework for how we can think about linear regressions and machine learning algorithms generally

In [None]:
#identify target variable
#run many bivariate linear regressions from all variables against the target variable, and measure the errors
#
#
#

# Train/Test Split

* We need a procedure to estimate how well a model is likely to perform on out-of-sample data and use that to choose between models.
* With a train test split, we split our X and y values into into a single training group and a single test group.
* Our X and y train values apply to the same observations and we use them to fit our model
* Our X and y test values apply to the same observations and use them to test our model

![train_test_split](./assets/train_test_split.png)


### take a minute to think about what seems like the benefit of doing this?

In [10]:
from sklearn.model_selection import train_test_split

# split data into train and test - default split is 75/25 train/test; you can modify
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
print (X_train.shape)
print (y_train.shape)
print (X_test.shape)
print (y_test.shape)

(8164, 1)
(8164,)
(2722, 1)
(2722,)


In [12]:
linreg = LinearRegression()
# fit on train data
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [13]:
# predict and score on unseen data
y_pred = linreg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

166.1008233806573

### In groups, try this on a new dataset

In [3]:
sac = pd.read_csv('./data/sacramento_real_estate_transactions.csv')
sac.head(1)

Unnamed: 0,street,city,zip,state,beds,baths,sq__ft,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879


### create a feature matrix (X) and target matrix (y), where your feature is a sq_ft and target is price

### create a train test split

### fit your regression on your training data

### predict on your test features

### score your predictions against your y_test values

### make a scatter plot ('a residual plot') and interpret how your predictions perform