Pick either classification or logistic regression (see below).
Write a short 200-to-500 word summary of your project along with the findings. 

Submit all of your content - code, data, words - as a GitHub repository. Your text should be written in markdown as a README.md.

## CLASSIFICATION

Obtain 1000+ things. You can get them via scraping, using an API, or even downloading a few large texts and using .split(".") to break them into sentences.  Either text or numeric is fine.

If unlabeled, label at least 100 of them and write a classifier to label the rest.

If labeled, write a classifier to automatically classify them.

Try several classifiers to find the 'best' results according to accuracy score and confusion matrix.

Find the most important features.

In [1]:
import pandas as pd



In [4]:
df = pd.read_csv('2018_Central_Park_Squirrel_Census_-_Squirrel_Data.csv')
df.head()

Unnamed: 0,X,Y,Unique Squirrel ID,Hectare,Shift,Date,Hectare Squirrel Number,Age,Primary Fur Color,Highlight Fur Color,...,Kuks,Quaas,Moans,Tail flags,Tail twitches,Approaches,Indifferent,Runs from,Other Interactions,Lat/Long
0,-73.956134,40.794082,37F-PM-1014-03,37F,PM,10142018,3,,,,...,False,False,False,False,False,False,False,False,,POINT (-73.9561344937861 40.7940823884086)
1,-73.968857,40.783783,21B-AM-1019-04,21B,AM,10192018,4,,,,...,False,False,False,False,False,False,False,False,,POINT (-73.9688574691102 40.7837825208444)
2,-73.974281,40.775534,11B-PM-1014-08,11B,PM,10142018,8,,Gray,,...,False,False,False,False,False,False,False,False,,POINT (-73.97428114848522 40.775533619083)
3,-73.959641,40.790313,32E-PM-1017-14,32E,PM,10172018,14,Adult,Gray,,...,False,False,False,False,False,False,False,True,,POINT (-73.9596413903948 40.7903128889029)
4,-73.970268,40.776213,13E-AM-1017-05,13E,AM,10172018,5,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894)


### Classifier to determine if a squirrel is a juvenile or an adult based on if they approach

In [5]:
df.Age.value_counts()

Adult       2568
Juvenile     330
?              4
Name: Age, dtype: int64

In [7]:
df['is_juvenile'] = (df.Age == 'Juvenile').astype(int)
df.is_juvenile.value_counts()

0    2693
1     330
Name: is_juvenile, dtype: int64

In [9]:
df.Age = df.Age.astype('U')

In [15]:
df.Approaches = df.Approaches.astype('string')

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Make a vectorizer
vectorizer = TfidfVectorizer()

# Learn and count the words in df.content
matrix = vectorizer.fit_transform(df.Approaches)

# Convert the matrix of counts to a dataframe
words_df = pd.DataFrame(matrix.toarray(),
                        columns=vectorizer.get_feature_names())
words_df

Unnamed: 0,false,true
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
3018,1.0,0.0
3019,1.0,0.0
3020,1.0,0.0
3021,1.0,0.0


In [18]:
X = words_df
y = df.is_juvenile

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

#### RandomForestClassifier

In [20]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier()

In [21]:
clf.score(X_test, y_test)

0.8822751322751323

In [22]:
df.is_juvenile.value_counts(normalize=True)

0    0.890837
1    0.109163
Name: is_juvenile, dtype: float64

In [23]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['not juvenile', 'juvenile'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted not juvenile,Predicted juvenile
Is not juvenile,667,0
Is juvenile,89,0


#### DecisionTreeClassifier


In [24]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=5)

In [25]:
clf.score(X_test, y_test)

0.8822751322751323

In [26]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['not juvenile', 'juvenile'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted not juvenile,Predicted juvenile
Is not juvenile,667,0
Is juvenile,89,0


#### Perceptron

In [27]:
from sklearn.linear_model import Perceptron

clf = Perceptron(max_iter=4000)
clf.fit(X_train, y_train)

Perceptron(max_iter=4000)

In [28]:
clf.score(X_test, y_test)

0.8822751322751323

In [29]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['not juvenile', 'juvenile'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted not juvenile,Predicted juvenile
Is not juvenile,667,0
Is juvenile,89,0


#### CalibratedClassifierCV

In [31]:
from sklearn.calibration import CalibratedClassifierCV

clf = CalibratedClassifierCV()
clf.fit(X_train, y_train)

CalibratedClassifierCV()

In [32]:
clf.score(X_test, y_test)

0.8822751322751323

In [34]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['not juvenile', 'juvenile'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted not juvenile,Predicted juvenile
Is not juvenile,667,0
Is juvenile,89,0


### Examining juvenile data only and finding the most important features

In [35]:
juvenile_df = df[df.Age.str.contains("Juvenile", na=False)]
juvenile_df.head()

Unnamed: 0,X,Y,Unique Squirrel ID,Hectare,Shift,Date,Hectare Squirrel Number,Age,Primary Fur Color,Highlight Fur Color,...,Quaas,Moans,Tail flags,Tail twitches,Approaches,Indifferent,Runs from,Other Interactions,Lat/Long,is_juvenile
17,-73.963818,40.792417,32A-PM-1013-03,32A,PM,10132018,3,Juvenile,Gray,Cinnamon,...,False,False,False,False,False,True,False,,POINT (-73.9638179439747 40.7924173263904),1
34,-73.95657,40.790256,33H-AM-1019-02,33H,AM,10192018,2,Juvenile,Gray,Cinnamon,...,False,False,False,False,False,False,False,,POINT (-73.9565700386162 40.7902561000937),1
38,-73.957465,40.789251,31H-PM-1008-02,31H,PM,10082018,2,Juvenile,Gray,Cinnamon,...,False,False,False,True,False,True,False,,POINT (-73.9574648097543 40.78925084286221),1
47,-73.967563,40.781348,19D-AM-1007-01,19D,AM,10072018,1,Juvenile,Gray,,...,False,False,False,False,False,False,False,fenced off area can't approach,POINT (-73.9675634326877 40.7813477352507),1
48,-73.958497,40.798289,40B-AM-1019-03,40B,AM,10192018,3,Juvenile,Gray,White,...,False,False,False,True,True,False,False,,POINT (-73.9584970643213 40.7982886348696),1


In [37]:
df['Primary Fur Color'].value_counts()

Gray        2473
Cinnamon     392
Black        103
Name: Primary Fur Color, dtype: int64

In [40]:
df['Primary Fur Color'] = df['Primary Fur Color'].dropna

In [49]:
juvenile_df['is_skittish'] = juvenile_df['Other Interactions'].str.contains("run")
juvenile_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,X,Y,Unique Squirrel ID,Hectare,Shift,Date,Hectare Squirrel Number,Age,Primary Fur Color,Highlight Fur Color,...,Tail twitches,Approaches,Indifferent,Runs from,Other Interactions,Lat/Long,is_juvenile,is_gray,is_running,is_skittish
17,-73.963818,40.792417,32A-PM-1013-03,32A,PM,10132018,3,Juvenile,Gray,Cinnamon,...,False,False,True,False,,POINT (-73.9638179439747 40.7924173263904),1,True,,
34,-73.95657,40.790256,33H-AM-1019-02,33H,AM,10192018,2,Juvenile,Gray,Cinnamon,...,False,False,False,False,,POINT (-73.9565700386162 40.7902561000937),1,True,,
38,-73.957465,40.789251,31H-PM-1008-02,31H,PM,10082018,2,Juvenile,Gray,Cinnamon,...,True,False,True,False,,POINT (-73.9574648097543 40.78925084286221),1,True,,
47,-73.967563,40.781348,19D-AM-1007-01,19D,AM,10072018,1,Juvenile,Gray,,...,False,False,False,False,fenced off area can't approach,POINT (-73.9675634326877 40.7813477352507),1,True,False,False
48,-73.958497,40.798289,40B-AM-1019-03,40B,AM,10192018,3,Juvenile,Gray,White,...,True,True,False,False,,POINT (-73.9584970643213 40.7982886348696),1,True,,


In [46]:
juvenile_df['Other Interactions'].value_counts()


watching me from tree - curious                               2
indifferent then runs from                                    2
runs from (lawnmower)                                         1
allowed me to approach within 10 ft                           1
keeps to tree,drops down on side away from people             1
ran from,but now approaching. kind of weirding me out.        1
froze                                                         1
me!                                                           1
runs from (people)                                            1
watching me from tree,but calm                                1
runs from (ran into treebrush)                                1
fenced off area can't approach                                1
runs off                                                      1
approaches (very friendly)                                    1
was in a tree,so i couldn't really tell                       1
runs from (goes up a tree)              

In [50]:
juvenile_df['is_skittish'].value_counts()

False    29
True     10
Name: is_skittish, dtype: int64

In [75]:
train_df = pd.DataFrame({
    'is_skittish': juvenile_df.is_skittish,
    'run': juvenile_df['Other Interactions'].str.contains("run", na=False).astype(int),
    'ran': juvenile_df['Other Interactions'].str.contains("ran", na=False).astype(int),
    'runs': juvenile_df['Other Interactions'].str.contains("runs", na=False).astype(int),
    'cautious': juvenile_df['Other Interactions'].str.contains("cautious", na=False).astype(int),
    'pause': juvenile_df['Other Interactions'].str.contains("pause", na=False).astype(int),
    'scared': juvenile_df['Other Interactions'].str.contains("scared", na=False).astype(int),
    'froze': juvenile_df['Other Interactions'].str.contains("froze", na=False).astype(int)
})

In [76]:
train_df = train_df.dropna()

In [80]:
X = train_df.drop(columns=['is_skittish'])
y = train_df.is_skittish
y=y.astype('int')

In [81]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [82]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=5)

In [83]:
clf.score(X_test, y_test)

1.0

In [84]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['not juvenile', 'juvenile'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted not juvenile,Predicted juvenile
Is not juvenile,7,0
Is juvenile,0,3


In [85]:
import eli5

feature_names=list(X.columns)
eli5.show_weights(clf, feature_names=feature_names, show=['description', 'feature_importances'])

Weight,Feature
1.0,run
0.0,froze
0.0,scared
0.0,pause
0.0,cautious
0.0,runs
0.0,ran
