# Scikit-Learn Example Using Stata's auto2.dta

In [1]:
import pandas as pd
import numpy as np
import sys

In [2]:
sys.path.append('/gits/conrpt')
import conrpt

In [3]:
# Load example data provided by Stata
exfile = pd.read_stata('http://www.stata-press.com/data/r15/auto2.dta')

In [4]:
# Use pd.set_option to conserve display space.
pd.set_option('display.max_rows', 8)

# Generate a dummy variable for later use.
# For Stata users, equivalent code: gen fgn = (foreign == "Foreign")
exfile['fgn'] = np.where(exfile['foreign']=='Foreign', 1, 0)

# Check results.
exfile

Unnamed: 0,make,price,mpg,rep78,headroom,trunk,weight,length,turn,displacement,gear_ratio,foreign,fgn
0,AMC Concord,4099,22,Average,2.5,11,2930,186,40,121,3.58,Domestic,0
1,AMC Pacer,4749,17,Average,3.0,11,3350,173,40,258,2.53,Domestic,0
2,AMC Spirit,3799,22,,3.0,12,2640,168,35,121,3.08,Domestic,0
3,Buick Century,4816,20,Average,4.5,16,3250,196,40,196,2.93,Domestic,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,VW Diesel,5397,41,Excellent,3.0,15,2040,155,35,90,3.78,Foreign,1
71,VW Rabbit,4697,25,Good,3.0,15,1930,155,35,89,3.78,Foreign,1
72,VW Scirocco,6850,25,Good,2.0,16,1990,156,36,97,3.78,Foreign,1
73,Volvo 260,11995,17,Excellent,2.5,14,3170,193,37,163,2.98,Foreign,1


In [5]:
# Feature Matrix
X = exfile[['price','mpg','length']]
# Target Matrix
y = exfile[['fgn']]

In [6]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=4, criterion='entropy')
clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [7]:
import graphviz
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from IPython.display import Image 
dot_data = export_graphviz(clf, out_file=None,
                          impurity=False,
                          filled=True,
                          rounded=True,
                          special_characters=True,
                           class_names=["Domestic","Foreign"],
                          feature_names=list(X))
# graphviz.Source(dot_data)

In [8]:
from sklearn.metrics import (accuracy_score, precision_score, 
                             recall_score, f1_score, log_loss)

print(f'training accuracy = {accuracy_score(y, clf.predict(X))}')
print(f'training precision = {precision_score(y, clf.predict(X))}')
print(f'training recall = {recall_score(y, clf.predict(X))}')
print(f'training f1 score = {f1_score(y, clf.predict(X))}')
print(f'training log loss = {log_loss(y, clf.predict(X))}')

training accuracy = 0.9324324324324325
training precision = 1.0
training recall = 0.7727272727272727
training f1 score = 0.8717948717948718
training log loss = 2.3337011077642362


In [9]:
from sklearn.metrics import confusion_matrix

# Add variable that indicates prediction result.
exfile['fgn_pred'] = clf.predict(X)

In [10]:
pd.set_option('display.max_rows', 74)

# Display opservations with incorrect predictions.
exfile[exfile['fgn'] != exfile['fgn_pred']]

Unnamed: 0,make,price,mpg,rep78,headroom,trunk,weight,length,turn,displacement,gear_ratio,foreign,fgn,fgn_pred
59,Fiat Strada,4296,21,Average,2.5,16,2130,161,36,105,3.37,Foreign,1,0
62,Mazda GLC,3995,30,Good,3.5,11,1980,154,33,86,3.73,Foreign,1,0
64,Renault Le Car,3895,26,Average,3.0,10,1830,142,34,79,3.72,Foreign,1,0
65,Subaru,3798,35,Excellent,2.5,11,2050,164,36,97,3.81,Foreign,1,0
67,Toyota Corolla,3748,31,Excellent,3.0,9,2200,165,35,97,3.21,Foreign,1,0


In [11]:
exfile

Unnamed: 0,make,price,mpg,rep78,headroom,trunk,weight,length,turn,displacement,gear_ratio,foreign,fgn,fgn_pred
0,AMC Concord,4099,22,Average,2.5,11,2930,186,40,121,3.58,Domestic,0,0
1,AMC Pacer,4749,17,Average,3.0,11,3350,173,40,258,2.53,Domestic,0,0
2,AMC Spirit,3799,22,,3.0,12,2640,168,35,121,3.08,Domestic,0,0
3,Buick Century,4816,20,Average,4.5,16,3250,196,40,196,2.93,Domestic,0,0
4,Buick Electra,7827,15,Good,4.0,20,4080,222,43,350,2.41,Domestic,0,0
5,Buick LeSabre,5788,18,Average,4.0,21,3670,218,43,231,2.73,Domestic,0,0
6,Buick Opel,4453,26,,3.0,10,2230,170,34,304,2.87,Domestic,0,0
7,Buick Regal,5189,20,Average,2.0,16,3280,200,42,196,2.93,Domestic,0,0
8,Buick Riviera,10372,16,Average,3.5,17,3880,207,43,231,2.93,Domestic,0,0
9,Buick Skylark,4082,19,Average,3.5,13,3400,200,42,231,3.08,Domestic,0,0


In [13]:
# Evaluate the prediction with conrpt
conrpt.conrpt(exfile[['fgn','fgn_pred']])


Notes: ObservedPos: 22, ObservedNeg: 52, & ObservedTot: 74, Prevalence: 29.730


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['srtr'] = numpy.random.randint(1, 101, size=len(df))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df[new_col_name] = numpy.where(df['srtr'] < coin, 1, 0)


Unnamed: 0,Results,Perfect,fgn_pred,25coin,50coin,75coin
0,TestedPos,22.0,17.0,25.0,38.0,58.0
1,TestedNeg,52.0,57.0,49.0,36.0,16.0
2,TestedTot,74.0,74.0,74.0,74.0,74.0
3,TruePos,22.0,17.0,5.0,12.0,17.0
4,TrueNeg,52.0,52.0,32.0,26.0,11.0
5,FalesPos,0.0,5.0,17.0,10.0,5.0
6,FalseNeg,0.0,0.0,20.0,26.0,41.0
7,Sensitivity,1.0,0.773,0.227,0.545,0.773
8,Specificity,1.0,1.0,0.615,0.5,0.212
9,PosPredVal,1.0,0.773,0.227,0.545,0.773


In [14]:
# Normalize a single prediction result.
def norm_pred(coded_pred):
    normed = 'foreign'
    if coded_pred == 0:
        normed = 'domestic'
    return normed

# Generate a single prediction for price = 4300, mpg = 20, length = 190.
print('Car with price = 4300, mpg = 20, length = 190 predicted to be {}'.format(norm_pred(
    clf.predict(np.array([4300,20,190]).reshape(1, -1)))))

Car with price = 4300, mpg = 20, length = 190 predicted to be domestic


In [15]:
# Generate a single prediction for price = 4300, mpg = 20, length = 190.
print('Car with price = 4100, mpg = 20, length = 190 predicted to be {}'.format(norm_pred(
    clf.predict(np.array([4100,20,190]).reshape(1, -1)))))

Car with price = 4100, mpg = 20, length = 190 predicted to be domestic


In [16]:
# Generate a single prediction for price = 4300, mpg = 20, length = 190.
print('Car with price = 9700, mpg = 20, length = 190 predicted to be {}'.format(norm_pred(
    clf.predict(np.array([9700,20,190]).reshape(1, -1)))))

Car with price = 9700, mpg = 20, length = 190 predicted to be foreign
