 # Functions - Key highlights

### 0. Function example - Simple

In [1]:
%reset -f
def my_func(x, y, z):
    """Return the sum of ``x`` and ``y`` multiplied by ``z``."""
    return (x + y) * z

### Let's call it - if x=1, y=2, z=2 (keyword arguments can be passed in any order)

In [2]:
my_func(z=2,x=1,y=2)

6

### Positional arguments vs. keyword arguments (and default values)

In [3]:
def my_func(x, y, z=7):
    """Print ``x``, then return ``(x + y) * z``.

    ``z`` is optional and defaults to 7 when the caller omits it.
    """
    print("x is " + str(x))
    total = x + y
    return total * z

### Let's call it - if x=3, y=2 (z is omitted, so it falls back to its default of 7)

In [4]:
my_func(y =2, x=3)

x is 3


35

# MORE EXAMPLES 

In [5]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [6]:
df=pd.read_csv("titanic.csv")
iris = pd.read_csv('iris.csv')

## 1. Data checks using functions - (describe,head,missing,columns)


### What we know already

In [7]:
# A notebook cell only renders its final expression, so the earlier checks are
# passed to display() explicitly; otherwise their results are silently dropped.
display(df.describe())      # summary statistics
display(df.isnull().sum())  # missing-value count per column
display(df.dtypes)          # dtype of each column
display(df.columns)         # column labels
df.head()                   # first rows, rendered as the cell output

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### Data checks using functions - multiple return values

In [8]:
def myfun(df):
    """Run the usual quick checks on a data frame in a single call.

    Returns a 5-tuple, in this order: ``df.describe()``,
    ``df.isnull().sum()``, ``df.dtypes``, ``df.columns`` and ``df.head()``.
    """
    return (
        df.describe(),
        df.isnull().sum(),
        df.dtypes,
        df.columns,
        df.head(),
    )
myfun(df)

(         survived      pclass         age       sibsp       parch        fare
 count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
 mean     0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
 std      0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
 min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
 25%      0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
 50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
 75%      1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
 max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200,
 survived         0
 pclass           0
 sex              0
 age            177
 sibsp            0
 parch            0
 fare             0
 embarked         2
 class            0
 who              0
 adult_male       0
 deck           688
 embark_town      2
 alive            0
 alone  

#### Question: Can you use the above function to check the iris data frame?

In [9]:
myfun(iris)

(       sepal_length  sepal_width  petal_length  petal_width
 count    150.000000   150.000000    150.000000   150.000000
 mean       5.843333     3.057333      3.758000     1.199333
 std        0.828066     0.435866      1.765298     0.762238
 min        4.300000     2.000000      1.000000     0.100000
 25%        5.100000     2.800000      1.600000     0.300000
 50%        5.800000     3.000000      4.350000     1.300000
 75%        6.400000     3.300000      5.100000     1.800000
 max        7.900000     4.400000      6.900000     2.500000, sepal_length    0
 sepal_width     0
 petal_length    0
 petal_width     0
 species         0
 dtype: int64, sepal_length    float64
 sepal_width     float64
 petal_length    float64
 petal_width     float64
 species          object
 dtype: object, Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
        'species'],
       dtype='object'),    sepal_length  sepal_width  petal_length  petal_width species
 0           5.1        

# 2. Aggregation using functions 

### What's the mean survival rate by gender/class/embarked...?

In [10]:
df.groupby(["sex"])["survived"].mean()

sex
female    0.742038
male      0.188908
Name: survived, dtype: float64

### Aggregation using function with return

In [11]:
def myfun2(df, group, var):
    """Return the mean of column ``var`` for each group of column ``group``."""
    grouped = df.groupby([group])
    return grouped[var].mean()
myfun2(df,"sex","survived")

sex
female    0.742038
male      0.188908
Name: survived, dtype: float64

### Using a list in a function with multiple results 

In [12]:
# Grouping columns to summarise; myfun2 is called once per entry.
mylist=["sex","class"]
# Collect the mean 'survived' value per group for each grouping column,
# in the same order as mylist.
myData = [myfun2(df, group, 'survived') for group in mylist]
# Bare last expression so the notebook renders the list.
myData

[sex
 female    0.742038
 male      0.188908
 Name: survived, dtype: float64, class
 First     0.629630
 Second    0.472826
 Third     0.242363
 Name: survived, dtype: float64]

# 3. Apply a regression model using functions

### This is what we normally write

In [13]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [14]:
#setup target and feature variables
# Target (what we predict) and the feature columns used to predict it
target = df["survived"]
features = df[["pclass", "sibsp", "parch"]]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

# Build the logistic-regression classifier and fit it on the training rows
logi_reg = LogisticRegression()
logi_reg.fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred = logi_reg.predict(X_test)

# Score of the fitted model on the held-out rows
accuracy = logi_reg.score(X_test, y_test)

print(accuracy)

0.701492537313


### Apply a function

In [15]:
def myreg(df, yvar, xvars, test_size=0.3, random_state=42):
    """Fit a logistic regression on a train/test split and return its test score.

    Parameters
    ----------
    df
        Not read by this function; kept in the signature so existing calls
        such as ``myreg(df=..., yvar=..., xvars=...)`` keep working.
    yvar
        Target values, passed to ``train_test_split`` unchanged.
    xvars
        Feature values, passed to ``train_test_split`` unchanged.
    test_size : optional
        Forwarded to ``train_test_split``. Defaults to 0.3, the value that
        was previously hard-coded.
    random_state : optional
        Forwarded to ``train_test_split``. Defaults to 42, the value that
        was previously hard-coded.

    Returns
    -------
    The result of ``LogisticRegression.score`` on the held-out split.
    """
    # Split features and target into training and held-out test portions.
    X_train, X_test, y_train, y_test = train_test_split(
        xvars, yvar, test_size=test_size, random_state=random_state
    )

    # Fit the logistic-regression model on the training portion only.
    logi_reg = LogisticRegression()
    logi_reg.fit(X_train, y_train)

    # score() evaluates the model on X_test directly, so the separate
    # predict() call whose result was never used has been dropped.
    return logi_reg.score(X_test, y_test)

In [16]:
myreg(df=df,yvar = df["survived"],xvars=df[["pclass","sibsp","parch"]])

0.70149253731343286

### Option 2

In [17]:
myfeatures = df[["pclass","sibsp","parch"]]
mytar = df["survived"]
myreg(df=df,yvar = mytar,xvars=myfeatures)

0.70149253731343286

### Question: Can you apply the above function to the iris data frame?
#### target = newtar   features = sepal_length

In [18]:
# let's tidy up the target variable first
iris['newtar'] = np.where(iris['species']=='setosa', "se", "notse")

In [19]:
# Double brackets select the feature as a one-column DataFrame rather than a Series
myfeatures = iris[['sepal_length']]
mytar = iris["newtar"]

myreg(df=iris, yvar=mytar, xvars=myfeatures)

0.57777777777777772

In [20]:
# Show only the first rows to confirm the new 'newtar' column was added,
# instead of rendering the whole data frame.
iris.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,newtar
0,5.1,3.5,1.4,0.2,setosa,se
1,4.9,3.0,1.4,0.2,setosa,se
2,4.7,3.2,1.3,0.2,setosa,se
3,4.6,3.1,1.5,0.2,setosa,se
4,5.0,3.6,1.4,0.2,setosa,se
5,5.4,3.9,1.7,0.4,setosa,se
6,4.6,3.4,1.4,0.3,setosa,se
7,5.0,3.4,1.5,0.2,setosa,se
8,4.4,2.9,1.4,0.2,setosa,se
9,4.9,3.1,1.5,0.1,setosa,se


In [21]:
# Double square brackets because scikit-learn expects a 2D matrix of features
myfeatures = iris[["sepal_length"]]
mytar = iris["newtar"]
myreg(df=iris, yvar=mytar, xvars=myfeatures)

0.57777777777777772