# Perdiz arrow points

In [1]:
# load analysis packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# read data
data = pd.read_csv('perdizsite.csv')
data.head()

Unnamed: 0,spec,site,region,maxl,maxw,maxth,maxstl,maxstw
0,554,41cp12,north,25.4,12.18,3.82,5.75,3.84
1,555,41cp12,north,22.92,12.87,3.54,3.71,3.69
2,556,41cp12,north,24.09,11.87,3.61,5.15,4.78
3,559,41cp12,north,25.01,10.57,3.5,5.84,3.88
4,562,41cp12,north,22.1,10.45,3.47,3.77,3.43


### select features and response

In [2]:
# attributes for analysis
feature_cols = ['maxl', 'maxw', 'maxth', 'maxstl', 'maxstw']
X = data[feature_cols]

# cast from string to int
reg_num = {'north':0, 'south':1}
data['reg_num'] = data.region.map(reg_num)
data.head()
y = data.reg_num

### ensure that features and responses are numeric

In [3]:
X.dtypes

maxl      float64
maxw      float64
maxth     float64
maxstl    float64
maxstw    float64
dtype: object

In [4]:
y.dtypes

dtype('int64')

### split data for train/test

In [5]:
# split data into train/test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(50, 5)
(17, 5)
(50,)
(17,)


### decrease sensitivity of algorithm to outliers through standardizing features

In [6]:
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

### k-fold cross validation of training dataset

In [7]:
logreg = LogisticRegression()
scores = cross_val_score(estimator = logreg, X = X_train_std, y = y_train,
                         cv = 10, n_jobs = 1)
print(scores)
print(scores.mean(), scores.std())

[1.  0.8 0.8 0.8 0.8 0.8 1.  1.  1.  1. ]
0.9 0.09999999999999998


### fit model on training set

In [8]:
logreg.fit(X_train_std, y_train)

LogisticRegression()

### make predictions + evaluate accuracy

In [9]:
y_pred = logreg.predict(X_test_std)
print(metrics.accuracy_score(y_test, y_pred))

# what attributes are driving the differences between the regions?
logreg.coef_

0.9411764705882353


array([[ 1.17917463,  0.80884852, -0.40829772, -0.89984492,  1.39380795]])